Example #1
    def test_should_be_able_to_set_property(self):
        network_config = NetworkConfig()

        self.assertEqual(network_config.input_shape, [10])

        network_config.input_shape = [5, 5]

        self.assertEqual(network_config.input_shape, [5, 5])

        self.assertEqual(network_config.layers, [100, 100])

        network_config.layers = [10]

        self.assertEqual(network_config.layers, [10])
Example #2
    def test_should_be_able_to_set_property(self):
        network_config = NetworkConfig()

        self.assertEqual(network_config.input_shape, [10])

        network_config.input_shape = [5, 5]

        self.assertEqual(network_config.input_shape, [5, 5])

        network_config.layers = [{"type": "FC", "neurons": 10}]
        self.assertEqual(network_config.layers, [{
            "type": "FC",
            "neurons": 10
        }])
Example #3
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument(
        '-f', '--folder',
        help='The folder containing the config files',
        required=True
    )
    parser.add_argument(
        '-m', '--map',
        help='Run the specified map',
        required=False
    )
    parser.add_argument(
        '--eval',
        help="Run only evaluation task",
        dest='eval',
        action="store_true"
    )

    parser.add_argument(
        '-r', '--render',
        help="Render task",
        dest='render',
        action="store_true"
    )
    
    args = parser.parse_args()

    evaluation_config_path = os.path.join(args.folder, "evaluation.yml")
    evaluation_config = EvaluationConfig.load_from_yaml(evaluation_config_path)

    network_config_path = os.path.join(args.folder, "network.yml")
    network_config = NetworkConfig.load_from_yaml(network_config_path)

    reinforce_config_path = os.path.join(args.folder, "reinforce.yml")
    reinforce_config = ReinforceConfig.load_from_yaml(reinforce_config_path)

    map_name = args.map
    if map_name is None:
        print("You are traning the agent for the default map: ")
        print("FourTowersWithFriendlyUnitsFixedEnemiesFixedPosition")
    else:
        print("You are traning the agent for the map: ")
        print(map_name)
    
    #print(map_name)
    if args.eval:
        evaluation_config.training_episodes = 0
        network_config.restore_network = True

    if args.render:
        evaluation_config.render = True


    run_task(evaluation_config, network_config, reinforce_config, map_name=map_name)

    return 0
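As a quick illustration of the flag handling above (a reduced, standalone sketch rather than the original script): argparse's store_true actions default to False, so the eval and render branches only fire when the flags are passed explicitly.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('-f', '--folder', required=True)
parser.add_argument('--eval', dest='eval', action="store_true")
parser.add_argument('-r', '--render', dest='render', action="store_true")

args = parser.parse_args(['-f', 'tasks/example'])                # 'tasks/example' is a made-up folder
print(args.eval, args.render)                                    # False False -> full training, no rendering

args = parser.parse_args(['-f', 'tasks/example', '--eval', '-r'])
print(args.eval, args.render)                                    # True True -> evaluation only, with rendering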
Example #4
    def test_should_be_able_to_access_nested_properties(self):
        network_config = NetworkConfig.load_from_yaml(test_file)

        self.assertEqual(len(network_config.networks), 4)

        up_network = network_config.networks[0]
        down_network = network_config.networks[1]

        self.assertEqual(up_network['input_shape'], [110])
        self.assertEqual(up_network['output_shape'], [4])

        self.assertEqual(down_network['input_shape'], [110])
        self.assertEqual(down_network['output_shape'], [4])
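For orientation, the parsed structure this test relies on would look roughly like the following. Only the first two entries are constrained by the assertions; the remaining two are hypothetical placeholders.

# Hypothetical reconstruction of network_config.networks for the test file above.
networks = [
    {"input_shape": [110], "output_shape": [4]},   # up_network   (networks[0])
    {"input_shape": [110], "output_shape": [4]},   # down_network (networks[1])
    {},                                            # networks[2], not asserted here
    {},                                            # networks[3], not asserted here
]
assert len(networks) == 4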
Example #5
    def test_should_have_default_values(self):
        network_config = NetworkConfig()

        self.assertEqual(network_config.input_shape, [10])
        self.assertEqual(network_config.output_shape, [5])

        self.assertEqual(network_config.restore_network, True)
        self.assertEqual(network_config.network_path, None)

        self.assertEqual(network_config.summaries_path, None)
        self.assertEqual(network_config.summaries_step, 100)

        self.assertEqual(network_config.aggregator, "average")
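The real NetworkConfig presumably also handles YAML loading and nested networks (see Examples #4 and #6). The following minimal stand-in is only a sketch that would satisfy the default-value assertions above and the property-setter assertions in Examples #1 and #2; it is not the ABP implementation.

class NetworkConfigSketch:
    """Hypothetical stand-in shaped to satisfy Examples #1, #2 and #5."""

    _defaults = {
        "input_shape": [10],
        "output_shape": [5],
        "layers": [100, 100],
        "restore_network": True,
        "network_path": None,
        "summaries_path": None,
        "summaries_step": 100,
        "aggregator": "average",
    }

    def __init__(self, **overrides):
        # Copy the defaults so instances do not share mutable list values.
        values = {k: (list(v) if isinstance(v, list) else v)
                  for k, v in self._defaults.items()}
        values.update(overrides)
        super().__setattr__("_config", values)

    def __getattr__(self, name):
        try:
            return self._config[name]
        except KeyError:
            raise AttributeError(name)

    def __setattr__(self, name, value):
        # Every assignment (e.g. config.layers = [10]) lands in the backing dict.
        self._config[name] = value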
Example #6
    def test_should_create_network_config_object_from_file(self):
        network_config = NetworkConfig.load_from_yaml(test_file)

        self.assertEqual(network_config.input_shape, [20])
        self.assertEqual(network_config.output_shape, [5])

        self.assertEqual(network_config.restore_network, False)
        self.assertEqual(network_config.network_path,
                         "test/the/network/path.ckpt")

        self.assertEqual(network_config.summaries_path,
                         "test/summaries/path.ckpt")
        self.assertEqual(network_config.summaries_step, 50)

        self.assertEqual(network_config.aggregator, "average")
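One plausible shape for load_from_yaml, assuming PyYAML and a flat mapping whose keys mirror the attribute names asserted above; the actual ABP implementation may differ.

import yaml  # PyYAML assumed available

class YamlConfigSketch:
    """Illustrative only; not the ABP NetworkConfig.load_from_yaml."""

    def __init__(self, values=None):
        # Expose each top-level YAML key as an attribute.
        self.__dict__.update(values or {})

    @classmethod
    def load_from_yaml(cls, path):
        # A network.yml satisfying Example #6 would contain keys such as
        # input_shape, output_shape, restore_network, network_path,
        # summaries_path, summaries_step and aggregator.
        with open(path) as f:
            return cls(yaml.safe_load(f))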
Example #7
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-f',
                        '--folder',
                        help='The folder containing the config files',
                        required=True)

    # TODO Better way to load the task to run
    parser.add_argument(
        '-t',
        '--task',
        help="The task to run. The python module cointaining the ABP program",
        required=True)

    parser.add_argument(
        '-d',
        '--debug',
        help="Print lots of debugging statements",
        action="store_const",
        dest="loglevel",
        const=logging.DEBUG,
        default=logging.WARNING,
    )

    parser.add_argument(
        '-v',
        '--verbose',
        help="Be verbose. Logging level INFO",
        action="store_const",
        dest="loglevel",
        const=logging.INFO,
    )

    parser.add_argument('--eval',
                        help="Run only evaluation task",
                        dest='eval',
                        action="store_true")

    parser.add_argument('-r',
                        '--render',
                        help="Render task",
                        dest='render',
                        action="store_true")

    args = parser.parse_args()

    logger.setLevel(args.loglevel)

    evaluation_config_path = os.path.join(args.folder, "evaluation.yml")
    evaluation_config = EvaluationConfig.load_from_yaml(evaluation_config_path)

    network_config_path = os.path.join(args.folder, "network.yml")
    network_config = NetworkConfig.load_from_yaml(network_config_path)

    reinforce_config_path = os.path.join(args.folder, "reinforce.yml")
    reinforce_config = ReinforceConfig.load_from_yaml(reinforce_config_path)

    if args.eval:
        evaluation_config.training_episodes = 0
        network_config.restore_network = True

    if args.render:
        evaluation_config.render = True

    task_module = import_module(args.task)

    task_module.run_task(evaluation_config, network_config, reinforce_config)

    return 0
Example #8
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-f',
                        '--folder',
                        help='The folder containing the config files',
                        required=True)
    parser.add_argument('-m',
                        '--map',
                        help='Run the specified map',
                        required=False)
    parser.add_argument('-t',
                        '--test',
                        help='Run the test only, no training',
                        dest='test',
                        action="store_true")

    parser.add_argument(
        '-tf',
        '--train_forever',
        help='Keep training the agent after it reaches an optimal policy (no episode limit)',
        dest='train_forever',
        action="store_true")
    parser.add_argument('--eval',
                        help="Run only evaluation task",
                        dest='eval',
                        action="store_true")

    parser.add_argument('-r',
                        '--render',
                        help="Render task",
                        dest='render',
                        action="store_true")

    args = parser.parse_args()

    #print(args)
    #input()
    evaluation_config_path = os.path.join(args.folder, "evaluation.yml")
    evaluation_config = EvaluationConfig.load_from_yaml(evaluation_config_path)

    network_config_path = os.path.join(args.folder, "network.yml")
    network_config = NetworkConfig.load_from_yaml(network_config_path)

    reinforce_config_path = os.path.join(args.folder, "reinforce.yml")
    reinforce_config = ReinforceConfig.load_from_yaml(reinforce_config_path)

    if args.test:
        evaluation_config.training_episodes = 0
    map_name = args.map
    if map_name is None:
        print("You are traning the agent for the default map: ")
        print("FourTowesFriendlyunitsDecomposedGroupReward")
    else:
        print("You are traning the agent for the map: ")
        print(map_name)

    #print(map_name)
    if args.eval:
        evaluation_config.training_episodes = 0
        network_config.restore_network = True

    if args.render:
        evaluation_config.render = True

    run_task(evaluation_config,
             network_config,
             reinforce_config,
             map_name=map_name,
             train_forever=args.train_forever)

    return 0
Example #9
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument('-f',
                        '--folder',
                        help='The folder containing the config files',
                        required=True)
    parser.add_argument('-m',
                        '--map',
                        help='Run the specified map',
                        required=False)

    parser.add_argument('-tk',
                        '--task',
                        help="which task to run",
                        dest='task',
                        required=False)

    parser.add_argument('--model',
                        help="which model for agent",
                        dest='agent_model',
                        required=False)

    parser.add_argument('-t',
                        '--test',
                        help='Run the test only, no training',
                        dest='test',
                        action="store_true")

    parser.add_argument('-ce',
                        '--collecting_experience',
                        help='Just run test and collect experience',
                        dest='collecting_experience',
                        action="store_true")

    parser.add_argument(
        '-tf',
        '--train_forever',
        help='Keep training the agent after it reaches an optimal policy (no episode limit)',
        dest='train_forever',
        action="store_true")
    parser.add_argument('--eval',
                        help="Run only evaluation task",
                        dest='eval',
                        action="store_true")

    parser.add_argument('-r',
                        '--render',
                        help="Render task",
                        dest='render',
                        action="store_true")

    args = parser.parse_args()

    evaluation_config_path = os.path.join(args.folder, "evaluation.yml")
    evaluation_config = EvaluationConfig.load_from_yaml(evaluation_config_path)

    network_config_path = os.path.join(args.folder, "network.yml")
    network_config = NetworkConfig.load_from_yaml(network_config_path)

    reinforce_config_path = os.path.join(args.folder, "reinforce.yml")
    reinforce_config = ReinforceConfig.load_from_yaml(reinforce_config_path)

    if args.test:
        evaluation_config.training_episodes = 0
    if args.collecting_experience:
        reinforce_config.collecting_experience = True

    map_name = args.map
    if map_name is None:
        print("You are traning the agent for the default map: ")
        print("TugOfWar")
    else:
        print("You are traning the agent for the map: ")
        print(map_name)

    #print(map_name)
    if args.eval:
        evaluation_config.training_episodes = 0
        network_config.restore_network = True

    if args.render:
        evaluation_config.render = True

    if args.task == 'task_bigA':
        run_task_bigA(evaluation_config,
                      network_config,
                      reinforce_config,
                      map_name=map_name,
                      train_forever=args.train_forever)
    elif args.task == 'task_single_player':
        run_task(evaluation_config,
                 network_config,
                 reinforce_config,
                 map_name=map_name,
                 train_forever=args.train_forever)
    elif args.task == 'task_2p':
        run_task_2p(evaluation_config,
                    network_config,
                    reinforce_config,
                    map_name=map_name,
                    train_forever=args.train_forever)
    elif args.task == 'task_mbts':
        run_task_mbts(evaluation_config,
                      network_config,
                      reinforce_config,
                      map_name=map_name,
                      train_forever=args.train_forever)
    elif args.task == 'task_2p_2l':
        run_task_2p_2l(evaluation_config,
                       network_config,
                       reinforce_config,
                       map_name=map_name,
                       train_forever=args.train_forever)
    elif args.task == 'task_2p_2l_grid':
        run_task_2p_2l_grid(evaluation_config,
                            network_config,
                            reinforce_config,
                            map_name=map_name,
                            train_forever=args.train_forever)
    elif args.task == 'task_2p_2l_hp':
        run_task_2p_2l_hp(evaluation_config,
                          network_config,
                          reinforce_config,
                          map_name=map_name,
                          train_forever=args.train_forever,
                          agent_model=args.agent_model)
    elif args.task == 'task_2p_2l_deexplanation':
        run_task_2p_2l_deexplanation(evaluation_config,
                                     network_config,
                                     reinforce_config,
                                     map_name=map_name,
                                     agent_model=args.agent_model)
    elif args.task == 'task_2p_2l_grid_decomposed':
        run_task_2p_2l_grid_decomposed(evaluation_config,
                                       network_config,
                                       reinforce_config,
                                       map_name=map_name)
    elif args.task == 'task_2p_2l_grid_decomposed_trans':
        run_task_2p_2l_grid_decomposed_trans(evaluation_config,
                                             network_config,
                                             reinforce_config,
                                             map_name=map_name)
    elif args.task == 'task_mbts_grid':
        run_task_mbts_grid(evaluation_config,
                           network_config,
                           reinforce_config,
                           map_name=map_name)
    else:
        print("need task")
    return 0
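The task dispatch above is a long if/elif chain. An equivalent table-driven form could look like the sketch below; it ignores the per-task keyword differences (train_forever, agent_model), which would still need to be threaded through, and assumes the same run_task_* callables that main() imports.

# Hypothetical table-driven alternative to the if/elif chain above.
TASK_RUNNERS = {
    'task_bigA': run_task_bigA,
    'task_single_player': run_task,
    'task_2p': run_task_2p,
    'task_mbts': run_task_mbts,
    'task_2p_2l': run_task_2p_2l,
    'task_2p_2l_grid': run_task_2p_2l_grid,
    'task_2p_2l_hp': run_task_2p_2l_hp,
    'task_2p_2l_deexplanation': run_task_2p_2l_deexplanation,
    'task_2p_2l_grid_decomposed': run_task_2p_2l_grid_decomposed,
    'task_2p_2l_grid_decomposed_trans': run_task_2p_2l_grid_decomposed_trans,
    'task_mbts_grid': run_task_mbts_grid,
}

runner = TASK_RUNNERS.get(args.task)
if runner is None:
    print("No valid task specified (use --task).")
else:
    runner(evaluation_config, network_config, reinforce_config, map_name=map_name)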
Example #10
def run_task(evaluation_config,
             network_config,
             reinforce_config,
             map_name=None,
             train_forever=False):
    if use_cuda:
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
        print("|       USING CUDA       |")
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
    else:
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
        print("|     NOT USING CUDA     |")
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
    flags.FLAGS(sys.argv[:1])
    max_episode_steps = 40

    replay_dimension = evaluation_config.xai_replay_dimension
    env = TugOfWar(map_name=map_name,
                   generate_xai_replay=evaluation_config.generate_xai_replay,
                   xai_replay_dimension=replay_dimension)

    reward_types = env.reward_types
    combine_sa = env.combine_sa
    state_1, state_2 = env.reset()

    if network_config.output_shape == 4:
        reward_num = 4
        combine_decomposed_func = combine_decomposed_func_4
        player_1_end_vector = player_1_end_vector_4

    if network_config.output_shape == 8:
        reward_num = 8
        combine_decomposed_func = combine_decomposed_func_8
        player_1_end_vector = player_1_end_vector_8

    if network_config.output_shape == 1:
        reward_num = 1
        combine_decomposed_func = combine_decomposed_func_1
        player_1_end_vector = player_1_end_vector_1

    if not reinforce_config.is_random_agent_1:
        agent_1 = SADQAdaptive(name="TugOfWar",
                               state_length=len(state_1),
                               network_config=network_config,
                               reinforce_config=reinforce_config,
                               reward_num=reward_num,
                               combine_decomposed_func=combine_decomposed_func)
        print("sadq agent 1")
    else:
        print("random agent 1")

    training_summaries_path = evaluation_config.summaries_path + "/train"
    clear_summary_path(training_summaries_path)
    train_summary_writer = SummaryWriter(training_summaries_path)

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = SummaryWriter(test_summaries_path)

    agents_2 = ["random", "random_2"]

    round_num = 0

    privous_result = []
    update_wins_waves = 10

    all_experiences = []
    path = './saved_models/tug_of_war/agents/grid'
    exp_save_path = 'abp/examples/pysc2/tug_of_war/rand_v_rand.pt'
    if reinforce_config.collecting_experience and not reinforce_config.is_random_agent_2:
        agent_1_model = "TugOfWar_eval.pupdate_240"
        exp_save_path = 'abp/examples/pysc2/tug_of_war/all_experiences.pt'
        for r, d, f in os.walk(path):
            for file in f:
                if '.p' in file:
                    new_weights = torch.load(path + "/" + file)
                    new_agent_2 = SADQAdaptive(
                        name=file,
                        state_length=len(state_1),
                        network_config=network_config,
                        reinforce_config=reinforce_config,
                        memory_resotre=False,
                        reward_num=reward_num,
                        combine_decomposed_func=combine_decomposed_func)

                    new_agent_2.load_weight(new_weights)
                    new_agent_2.disable_learning(is_save=False)
                    agents_2.append(new_agent_2)

                    if agent_1_model == file:
                        print("********agent_1_model", file)
                        agent_1.load_model(new_agent_2.eval_model)

    elif network_config.restore_network:
        restore_path = network_config.network_path
        for r, d, f in os.walk(restore_path):
            f = sorted(f)
            for file in f:
                if 'eval.pupdate' in file or 'eval.p_the_best' in file:
                    new_weights = torch.load(restore_path + "/" + file)
                    new_agent_2 = SADQAdaptive(
                        name=file,
                        state_length=len(state_1),
                        network_config=network_config,
                        reinforce_config=reinforce_config,
                        memory_resotre=False,
                        reward_num=reward_num,
                        combine_decomposed_func=combine_decomposed_func)
                    new_agent_2.load_weight(new_weights)
                    new_agent_2.disable_learning(is_save=False)
                    agents_2.append(new_agent_2)
                    print("loaded agent:", file)
#     agent_1.steps = reinforce_config.epsilon_timesteps / 2
    if evaluation_config.generate_xai_replay:

        agent_1_model = "TugOfWar_eval.pupdate_600"
        agent_2_model = "TugOfWar_eval.pupdate_560"

        agents_2 = []
        if use_cuda:
            weights_1 = torch.load(path + "/" + agent_1_model)
            weights_2 = torch.load(path + "/" + agent_2_model)
        else:
            weights_1 = torch.load(path + "/" + agent_1_model,
                                   map_location=lambda storage, loc: storage)
            weights_2 = torch.load(path + "/" + agent_2_model,
                                   map_location=lambda storage, loc: storage)

        new_agent_2 = SADQAdaptive(
            name="record",
            state_length=len(state_1),
            network_config=network_config,
            reinforce_config=reinforce_config,
            memory_resotre=False,
            reward_num=reward_num,
            combine_decomposed_func=combine_decomposed_func)
        agent_1.load_weight(weights_1)
        new_agent_2.load_weight(weights_2)
        new_agent_2.disable_learning(is_save=False)
        agents_2.append(new_agent_2)

    if reinforce_config.is_use_sepcific_enemy:
        sepcific_SADQ_enemy_weights = torch.load(reinforce_config.enemy_path)

        sepcific_network_config = NetworkConfig.load_from_yaml(
            "./tasks/tug_of_war/sadq_2p_2l_decom/v2_8/network.yml")
        sepcific_network_config.restore_network = False
        sepcific_SADQ_enemy = SADQAdaptive(
            name="sepcific enemy",
            state_length=len(state_1),
            network_config=sepcific_network_config,
            reinforce_config=reinforce_config,
            memory_resotre=False,
            reward_num=sepcific_network_config.output_shape,
            combine_decomposed_func=combine_decomposed_func_8)

        sepcific_SADQ_enemy.load_weight(sepcific_SADQ_enemy_weights)
        sepcific_SADQ_enemy.disable_learning(is_save=False)
        agents_2 = [sepcific_SADQ_enemy]

    while True:
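        # Self-play loop: train agent_1 against every opponent in agents_2, run the
        # test episodes, and once the evaluation reward has stayed >= 0.9 for the
        # last update_wins_waves rounds, freeze a copy of agent_1 into the opponent
        # pool and reset its exploration schedule.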
        print(sum(np.array(privous_result) >= 0.9))
        if len(privous_result) >= update_wins_waves and \
        sum(np.array(privous_result) >= 0.9) >= update_wins_waves and \
        not reinforce_config.is_random_agent_2 and not reinforce_config.is_use_sepcific_enemy:
            privous_result = []
            print("replace enemy agent's weight with self agent")
            #             random_enemy = False
            f = open(evaluation_config.result_path, "a+")
            f.write("Update agent\n")
            f.close()

            new_agent_2 = SADQAdaptive(
                name="TugOfWar_" + str(round_num),
                state_length=len(state_2),
                network_config=network_config,
                reinforce_config=reinforce_config,
                memory_resotre=False,
                reward_num=reward_num,
                combine_decomposed_func=combine_decomposed_func)

            new_agent_2.load_model(agent_1.eval_model)
            new_agent_2.disable_learning(is_save=False)
            agents_2.append(new_agent_2)
            agent_1.steps = reinforce_config.epsilon_timesteps / 2
            agent_1.best_reward_mean = 0
            agent_1.save(force=True, appendix="update_" + str(round_num))

        round_num += 1

        print(
            "======================================================================="
        )
        print(
            "===============================Now training============================"
        )
        print(
            "======================================================================="
        )
        print("Now training.")

        print("Now have {} enemy".format(len(agents_2)))

        for idx_enemy, enemy_agent in enumerate(agents_2):
            #             break
            if reinforce_config.collecting_experience:
                break
            if type(enemy_agent) == type("random"):
                print(enemy_agent)
            else:
                print(enemy_agent.name)

            if idx_enemy == len(agents_2) - 1:
                training_num = evaluation_config.training_episodes
            else:
                training_num = 10

            for episode in tqdm(range(training_num)):
                #                 if type(enemy_agent) == type("random"):
                #                     break
                state_1, state_2 = env.reset()
                total_reward = 0
                skiping = True
                done = False
                steps = 0
                #             print(list(state_1))
                #             print(list(state_2))

                while skiping:
                    state_1, state_2, done, dp = env.step([], 0)
                    if dp or done:
                        break
                last_mineral = state_1[env.miner_index]
                while not done and steps < max_episode_steps:
                    steps += 1
                    #                     w += 1
                    #                     print(w)
                    # Decision point
                    #                 print('state:')
                    #                 print("=======================================================================")
                    # pretty_print(state_1, text = "state 1")
                    # pretty_print(state_2, text = "state 2")
                    if agent_1.steps < reinforce_config.epsilon_timesteps:
                        actions_1 = env.get_big_A(state_1[env.miner_index],
                                                  state_1[env.pylon_index],
                                                  is_train=1)
                    else:
                        actions_1 = env.get_big_A(state_1[env.miner_index],
                                                  state_1[env.pylon_index],
                                                  is_train=0)
                    actions_2 = env.get_big_A(state_2[env.miner_index],
                                              state_2[env.pylon_index],
                                              is_train=1)

                    assert state_1[-1] == state_2[-1] == steps, (
                        state_1, state_2, steps)
                    if not reinforce_config.is_random_agent_1:
                        combine_states_1 = combine_sa(state_1, actions_1)
                        #                     print(combine_states_1)
                        #                     print(env.normalization(combine_states_1))
                        #                     print(state_1[env.miner_index])
                        choice_1, _ = agent_1.predict(
                            env.normalization(combine_states_1))
    #                     input()
    #                     for cs1 in combine_states_1:
    #                         print(cs1.tolist())
                    else:
                        #                     combine_states_1 = combine_sa(state_1, actions_1)
                        choice_1 = randint(0, len(actions_1) - 1)

                    if not reinforce_config.is_random_agent_2 and type(
                            enemy_agent) != type("random"):
                        combine_states_2 = combine_sa(state_2, actions_2)
                        choice_2, _ = enemy_agent.predict(
                            env.normalization(combine_states_2))
                    else:
                        if enemy_agent == "random_2":
                            actions_2 = env.get_big_A(state_2[env.miner_index],
                                                      state_2[env.pylon_index])
                        choice_2 = randint(0, len(actions_2) - 1)

    #                 print("action list:")
    #                 print(actions_1)
    #                 print(actions_2)
    #                 assign action
#                     print("choice:")
#                     print(actions_1[choice_1])
#                 print(actions_2[choice_2])
#                     pretty_print(combine_states_1[choice_1], text = "after state:")
#                 input("pause")
#                 print(combine_states_2[choice_2].tolist())
#                 if state_1[env.miner_index] > 300:
#                     input('pause')
                    env.step(list(actions_1[choice_1]), 1)
                    env.step(list(actions_2[choice_2]), 2)
                    #                     if steps == 39:
                    #                         env.step([3,0,0,0,0,0,0], 1)

                    last_mineral = combine_states_1[choice_1][env.miner_index]

                    l_m_1 = state_1[env.miner_index]
                    l_m_2 = state_2[env.miner_index]

                    while skiping:
                        state_1, state_2, done, dp = env.step([], 0)
                        #                     input('time_step')
                        if dp or done:
                            break

#                     Check if the mineral is correct
#                     if not done and steps < max_episode_steps and type(enemy_agent) != type("random"):
#                         next_mineral_1 = combine_states_1[choice_1][env.miner_index] + 100 + combine_states_1[choice_1][env.pylon_index] * 75
# #                         if type(enemy_agent) != type("random"):
#                         next_mineral_2 = combine_states_2[choice_2][env.miner_index] + 100 + combine_states_2[choice_2][env.pylon_index] * 75
#                         if next_mineral_1 > 1500:
#                             next_mineral_1 = 1500
#                         if next_mineral_2 > 1500:
#                             next_mineral_2 = 1500

#                         print(next_mineral_1, state_1[env.miner_index], combine_states_1[choice_1], actions_1[choice_1])

# #                         if type(enemy_agent) != type("random"):
#                         print(next_mineral_2, state_2[env.miner_index], combine_states_2[choice_2], actions_2[choice_2])
#                         assert next_mineral_1 == state_1[env.miner_index], print(l_m_1, next_mineral_1, state_1[env.miner_index], combine_states_1[choice_1], actions_1[choice_1])
# #                         if type(enemy_agent) != type("random"):
#                         assert next_mineral_2 == state_2[env.miner_index], print(l_m_2, next_mineral_2, state_2[env.miner_index], combine_states_2[choice_2], actions_2[choice_2])

                    reward = [0] * reward_num
                    if steps == max_episode_steps or done:
                        reward = player_1_end_vector(state_1[63],
                                                     state_1[64],
                                                     state_1[65],
                                                     state_1[66],
                                                     is_done=done)

#                     reward_1, reward_2 = env.sperate_reward(env.decomposed_rewards)
#                     print('reward:')
# print(state_1[27], state_1[28], state_1[29], state_1[30])
#                     print(reward_1)
#                     print(reward_2)
#                     if steps == max_episode_steps or done:
#                         input()

                    if not reinforce_config.is_random_agent_1:
                        agent_1.reward(reward)

                if not reinforce_config.is_random_agent_1:
                    agent_1.end_episode(env.normalization(state_1))

#                 test_summary_writer.add_scalar(tag = "Train/Episode Reward", scalar_value = total_reward,
#                                                global_step = episode + 1)
#                 train_summary_writer.add_scalar(tag = "Train/Steps to choosing Enemies", scalar_value = steps + 1,
#                                                 global_step = episode + 1)

        if not reinforce_config.is_random_agent_1:
            agent_1.disable_learning(
                is_save=not reinforce_config.collecting_experience
                and not evaluation_config.generate_xai_replay)

        total_rewwards_list = []

        # Test Episodes
        print(
            "======================================================================"
        )
        print(
            "===============================Now testing============================"
        )
        print(
            "======================================================================"
        )

        tied_lose = 0
        for idx_enemy, enemy_agent in enumerate(agents_2):
            average_end_state = np.zeros(len(state_1))
            if type(enemy_agent) == type("random"):
                print(enemy_agent)
            else:
                print(enemy_agent.name)

            if idx_enemy == len(
                    agents_2
            ) - 1 and not reinforce_config.collecting_experience:
                test_num = evaluation_config.test_episodes
            else:
                test_num = 5

            for episode in tqdm(range(test_num)):
                env.reset()
                total_reward_1 = 0
                done = False
                skiping = True
                steps = 0
                previous_state_1 = None
                previous_state_2 = None
                previous_action_1 = None
                previous_action_2 = None
                if evaluation_config.generate_xai_replay:
                    recorder = XaiReplayRecorder2LaneNexus(
                        env.sc2_env, episode, evaluation_config.env,
                        action_component_names, replay_dimension)

#                 print("%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%Starting episode%%%%%%%%%%%%%%%%%%%%%%%%%")
#                 print(f"reinforce_config.collecting_experience {reinforce_config.collecting_experience}")
                while skiping:
                    #                     print("about to call env.step() during skip")
                    #                 start_time = time.time()
                    state_1, state_2, done, dp = env.step([], 0)
                    if evaluation_config.generate_xai_replay:
                        #recorder.save_jpg()
                        recorder.record_game_clock_tick(
                            env.decomposed_reward_dict)
                    if dp or done:
                        #                     print(time.time() - start_time)
                        break
#                 input(f"dp is {dp} done is {done}")
#                 print("done stepping to finish prior action")
                while not done and steps < max_episode_steps:
                    #                     input(f"not done and steps == {steps} < {max_episode_steps}")
                    steps += 1
                    #                 # Decision point
                    if not reinforce_config.is_random_agent_1:
                        actions_1 = env.get_big_A(state_1[env.miner_index],
                                                  state_1[env.pylon_index])
                        combine_states_1 = combine_sa(state_1, actions_1)
                        choice_1, _ = agent_1.predict(
                            env.normalization(combine_states_1))
                    else:
                        actions_1 = env.get_big_A(state_1[env.miner_index],
                                                  state_1[env.pylon_index],
                                                  is_train=1)
                        combine_states_1 = combine_sa(state_1, actions_1)
                        choice_1 = randint(0, len(actions_1) - 1)

                    if not reinforce_config.is_random_agent_2 and type(
                            enemy_agent) != type("random"):
                        actions_2 = env.get_big_A(state_2[env.miner_index],
                                                  state_2[env.pylon_index])
                        combine_states_2 = combine_sa(state_2, actions_2)
                        choice_2, _ = enemy_agent.predict(
                            env.normalization(combine_states_2))
                    else:
                        if enemy_agent == "random_2":
                            actions_2 = env.get_big_A(state_2[env.miner_index],
                                                      state_2[env.pylon_index],
                                                      is_train=0)
                        else:
                            actions_2 = env.get_big_A(state_2[env.miner_index],
                                                      state_2[env.pylon_index],
                                                      is_train=1)
                        combine_states_2 = combine_sa(state_2, actions_2)
                        choice_2 = randint(0, len(actions_2) - 1)

#                     input("record dp if engaged")
                    if evaluation_config.generate_xai_replay:
                        #recorder.save_jpg()
                        recorder.record_decision_point(
                            actions_1[choice_1], actions_2[choice_2], state_1,
                            state_2, env.decomposed_reward_dict)

    #                 input('stepped with command 2')
    #######
    #experience collecting
    ######
#                     input("collect experience if configured so")
                    if reinforce_config.collecting_experience:
                        if previous_state_1 is not None and previous_state_2 is not None and previous_action_1 is not None and previous_action_2 is not None:
                            previous_state_1[8:14] = previous_state_2[
                                1:7]  # Include player 2's action
                            previous_state_1[
                                env.miner_index] += previous_state_1[
                                    env.pylon_index] * 75 + 100
                            previous_state_1[-1] += 1

                            experience = [
                                previous_state_1,
                                np.append(state_1, previous_reward_1)
                            ]
                            all_experiences.append(experience)
                            if ((len(all_experiences)) % 100 == 0
                                ) and reinforce_config.collecting_experience:
                                torch.save(all_experiences, exp_save_path)

                        previous_state_1 = deepcopy(combine_states_1[choice_1])
                        previous_state_2 = deepcopy(combine_states_2[choice_2])

                        previous_action_1 = deepcopy(actions_1[choice_1])
                        previous_action_2 = deepcopy(actions_2[choice_2])

#                     input(f"step p1 with {list(actions_1[choice_1])}")
                    env.step(list(actions_1[choice_1]), 1)
                    #                     input(f"step p2 with {list(actions_2[choice_2])}")
                    env.step(list(actions_2[choice_2]), 2)
                    #                     # human play
                    #                     pretty_print(state_2, text = "state:")
                    #                     env.step(list(get_human_action()), 2)
                    #                     reinforce_config.collecting_experience = False

                    while skiping:
                        #                     print("Get actions time:")
                        #                     start_time = time.time()
                        #                         input("step to move the game along and send the wave")
                        state_1, state_2, done, dp = env.step([], 0)
                        if evaluation_config.generate_xai_replay:
                            #recorder.save_jpg()
                            recorder.record_game_clock_tick(
                                env.decomposed_reward_dict)
                        #input(' step wating for done signal')
                        if dp or done:
                            #                         print(time.time() - start_time)
                            break

                    reward = [0] * reward_num
                    if steps == max_episode_steps or done:
                        reward = player_1_end_vector(state_1[63],
                                                     state_1[64],
                                                     state_1[65],
                                                     state_1[66],
                                                     is_done=done)

#                     input("separate rewards...")
#                     reward_1, reward_2 = env.sperate_reward(env.decomposed_rewards)
#                 print(env.decomposed_rewards)
#                 print(reward_1, reward_2)

#                 for r1 in reward_1:
                    if reward_num == 4:
                        current_reward_1 = sum(reward[2:])
                    elif reward_num == 8:
                        current_reward_1 = reward[2] + reward[3] + reward[
                            6] + reward[7]
                    elif reward_num == 1:
                        current_reward_1 = sum(reward)
    #                 print(current_reward_1)

                    total_reward_1 += current_reward_1
                    #                 print(total_reward_1)
                    #                 if total_reward_1 > 14000 or total_reward_1 < -14000:
                    #                     input()
                    previous_reward_1 = current_reward_1
#                 print("collect experience again if configured so")
                if reinforce_config.collecting_experience:
                    previous_state_1[8:14] = previous_state_2[
                        1:7]  # Include player 2's action
                    previous_state_1[env.miner_index] += previous_state_1[
                        env.pylon_index] * 75 + 100
                    previous_state_1[-1] += 1

                    experience = [
                        previous_state_1,
                        np.append(state_1, previous_reward_1)
                    ]
                    all_experiences.append(experience)
                    if ((len(all_experiences)) % 100
                            == 0) and reinforce_config.collecting_experience:
                        torch.save(all_experiences, exp_save_path)

                average_end_state += state_1

                total_rewwards_list.append(total_reward_1)
                test_summary_writer.add_scalar(tag="Test/Episode Reward",
                                               scalar_value=total_reward_1,
                                               global_step=episode + 1)
                test_summary_writer.add_scalar(
                    tag="Test/Steps to choosing Enemies",
                    scalar_value=steps + 1,
                    global_step=episode + 1)
    #         if reinforce_config.collecting_experience:
    #             break
    #print(test.size())
    #         print(total_rewwards_list)


#             print("should be done with episode...")
            total_rewards_list_np = np.array(total_rewwards_list)

            tied = np.sum(total_rewards_list_np[-test_num:] == 0)
            wins = np.sum(total_rewards_list_np[-test_num:] > 0)
            lose = np.sum(total_rewards_list_np[-test_num:] <= 0)

            tied_lose += (tied + lose)
            print("wins/lose/tied")
            print(
                str(wins / test_num * 100) + "% \t",
                str(lose / test_num * 100) + "% \t",
            )
            #                  str(tied / test_num * 100) + "% \t")
            pretty_print(average_end_state / test_num)

        tr = sum(total_rewwards_list) / len(total_rewwards_list)
        print("total reward:")
        print(tr)

        privous_result.append(tr)

        if len(privous_result) > update_wins_waves:
            del privous_result[0]
        f = open(evaluation_config.result_path, "a+")
        f.write(str(tr) + "\n")
        f.close()

        if tied_lose == 0 and not reinforce_config.is_random_agent_1:
            agent_1.save(force=True, appendix="_the_best")

        if not reinforce_config.is_random_agent_1:
            agent_1.enable_learning()
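The decision-point interaction buried in both the training and the test loops above follows one pattern: enumerate the affordable actions, let each agent pick a combined state-action row, issue both players' commands, then advance the game to the next decision point. A stripped-down sketch of that pattern follows (play_episode is a hypothetical helper name; the initial wave skip and all reward/experience bookkeeping are omitted).

def play_episode(env, agent_1, enemy_agent, max_episode_steps=40):
    # Illustrative only: mirrors the env/agent calls used in run_task above.
    state_1, state_2 = env.reset()
    done, steps = False, 0
    while not done and steps < max_episode_steps:
        steps += 1
        # Enumerate the affordable build actions for each player.
        actions_1 = env.get_big_A(state_1[env.miner_index], state_1[env.pylon_index])
        actions_2 = env.get_big_A(state_2[env.miner_index], state_2[env.pylon_index])
        # Each agent scores the normalized state-action combinations and picks one.
        choice_1, _ = agent_1.predict(env.normalization(env.combine_sa(state_1, actions_1)))
        choice_2, _ = enemy_agent.predict(env.normalization(env.combine_sa(state_2, actions_2)))
        # Issue both players' commands, then step the game until the next
        # decision point (dp) or the end of the episode.
        env.step(list(actions_1[choice_1]), 1)
        env.step(list(actions_2[choice_2]), 2)
        dp = False
        while not (dp or done):
            state_1, state_2, done, dp = env.step([], 0)
    return state_1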