def test_should_be_able_to_set_property(self):
    network_config = NetworkConfig()
    self.assertEqual(network_config.input_shape, [10])
    network_config.input_shape = [5, 5]
    self.assertEqual(network_config.input_shape, [5, 5])

    self.assertEqual(network_config.layers, [100, 100])
    network_config.layers = [10]
    self.assertEqual(network_config.layers, [10])
def test_should_be_able_to_set_property(self):
    network_config = NetworkConfig()
    self.assertEqual(network_config.input_shape, [10])
    network_config.input_shape = [5, 5]
    self.assertEqual(network_config.input_shape, [5, 5])

    network_config.layers = [{"type": "FC", "neurons": 10}]
    self.assertEqual(network_config.layers, [{"type": "FC", "neurons": 10}])
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '-f', '--folder',
        help='The folder containing the config files',
        required=True
    )
    parser.add_argument(
        '-m', '--map',
        help='Run the specified map',
        required=False
    )
    parser.add_argument(
        '--eval',
        help="Run only the evaluation task",
        dest='eval',
        action="store_true"
    )
    parser.add_argument(
        '-r', '--render',
        help="Render the task",
        dest='render',
        action="store_true"
    )
    args = parser.parse_args()

    evaluation_config_path = os.path.join(args.folder, "evaluation.yml")
    evaluation_config = EvaluationConfig.load_from_yaml(evaluation_config_path)

    network_config_path = os.path.join(args.folder, "network.yml")
    network_config = NetworkConfig.load_from_yaml(network_config_path)

    reinforce_config_path = os.path.join(args.folder, "reinforce.yml")
    reinforce_config = ReinforceConfig.load_from_yaml(reinforce_config_path)

    map_name = args.map
    if map_name is None:
        print("You are training the agent for the default map:")
        print("FourTowersWithFriendlyUnitsFixedEnemiesFixedPosition")
    else:
        print("You are training the agent for the map:")
        print(map_name)

    if args.eval:
        evaluation_config.training_episodes = 0
        network_config.restore_network = True

    if args.render:
        evaluation_config.render = True

    run_task(evaluation_config, network_config, reinforce_config, map_name=map_name)

    return 0
def test_should_be_able_to_access_nested_properties(self):
    network_config = NetworkConfig.load_from_yaml(test_file)
    self.assertEqual(len(network_config.networks), 4)

    up_network = network_config.networks[0]
    down_network = network_config.networks[1]

    self.assertEqual(up_network['input_shape'], [110])
    self.assertEqual(up_network['output_shape'], [4])
    self.assertEqual(down_network['input_shape'], [110])
    self.assertEqual(down_network['output_shape'], [4])
def test_should_have_default_values(self):
    network_config = NetworkConfig()
    self.assertEqual(network_config.input_shape, [10])
    self.assertEqual(network_config.output_shape, [5])
    self.assertEqual(network_config.restore_network, True)
    self.assertEqual(network_config.network_path, None)
    self.assertEqual(network_config.summaries_path, None)
    self.assertEqual(network_config.summaries_step, 100)
    self.assertEqual(network_config.aggregator, "average")
def test_should_create_network_config_object_from_file(self):
    network_config = NetworkConfig.load_from_yaml(test_file)
    self.assertEqual(network_config.input_shape, [20])
    self.assertEqual(network_config.output_shape, [5])
    self.assertEqual(network_config.restore_network, False)
    self.assertEqual(network_config.network_path, "test/the/network/path.ckpt")
    self.assertEqual(network_config.summaries_path, "test/summaries/path.ckpt")
    self.assertEqual(network_config.summaries_step, 50)
    self.assertEqual(network_config.aggregator, "average")
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '--folder', help='The folder containing the config files', required=True)

    # TODO Better way to load the task to run
    parser.add_argument(
        '-t', '--task',
        help="The task to run. The python module containing the ABP program",
        required=True)

    parser.add_argument(
        '-d', '--debug',
        help="Print lots of debugging statements",
        action="store_const",
        dest="loglevel",
        const=logging.DEBUG,
        default=logging.WARNING,
    )
    parser.add_argument(
        '-v', '--verbose',
        help="Be verbose. Logging level INFO",
        action="store_const",
        dest="loglevel",
        const=logging.INFO,
    )
    parser.add_argument('--eval', help="Run only the evaluation task", dest='eval', action="store_true")
    parser.add_argument('-r', '--render', help="Render the task", dest='render', action="store_true")

    args = parser.parse_args()

    logger.setLevel(args.loglevel)

    evaluation_config_path = os.path.join(args.folder, "evaluation.yml")
    evaluation_config = EvaluationConfig.load_from_yaml(evaluation_config_path)

    network_config_path = os.path.join(args.folder, "network.yml")
    network_config = NetworkConfig.load_from_yaml(network_config_path)

    reinforce_config_path = os.path.join(args.folder, "reinforce.yml")
    reinforce_config = ReinforceConfig.load_from_yaml(reinforce_config_path)

    if args.eval:
        evaluation_config.training_episodes = 0
        network_config.restore_network = True

    if args.render:
        evaluation_config.render = True

    task_module = import_module(args.task)
    task_module.run_task(evaluation_config, network_config, reinforce_config)

    return 0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '--folder', help='The folder containing the config files', required=True)
    parser.add_argument('-m', '--map', help='Run the specified map', required=False)
    parser.add_argument('-t', '--test', help='Just run the test, no training', dest='test', action="store_true")
    parser.add_argument(
        '-tf', '--train_forever',
        help='Keep training the agent even after it reaches an optimal policy (no episode limit)',
        dest='train_forever',
        action="store_true")
    parser.add_argument('--eval', help="Run only the evaluation task", dest='eval', action="store_true")
    parser.add_argument('-r', '--render', help="Render the task", dest='render', action="store_true")

    args = parser.parse_args()

    evaluation_config_path = os.path.join(args.folder, "evaluation.yml")
    evaluation_config = EvaluationConfig.load_from_yaml(evaluation_config_path)

    network_config_path = os.path.join(args.folder, "network.yml")
    network_config = NetworkConfig.load_from_yaml(network_config_path)

    reinforce_config_path = os.path.join(args.folder, "reinforce.yml")
    reinforce_config = ReinforceConfig.load_from_yaml(reinforce_config_path)

    if args.test:
        evaluation_config.training_episodes = 0

    map_name = args.map
    if map_name is None:
        print("You are training the agent for the default map:")
        print("FourTowesFriendlyunitsDecomposedGroupReward")
    else:
        print("You are training the agent for the map:")
        print(map_name)

    if args.eval:
        evaluation_config.training_episodes = 0
        network_config.restore_network = True

    if args.render:
        evaluation_config.render = True

    run_task(evaluation_config, network_config, reinforce_config,
             map_name=map_name, train_forever=args.train_forever)

    return 0
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '--folder', help='The folder containing the config files', required=True)
    parser.add_argument('-m', '--map', help='Run the specified map', required=False)
    parser.add_argument('-tk', '--task', help="Which task to run", dest='task', required=False)
    parser.add_argument('--model', help="Which model the agent should use", dest='agent_model', required=False)
    parser.add_argument('-t', '--test', help='Just run the test, no training', dest='test', action="store_true")
    parser.add_argument('-ce', '--collecting_experience',
                        help='Just run the test and collect experience',
                        dest='collecting_experience', action="store_true")
    parser.add_argument(
        '-tf', '--train_forever',
        help='Keep training the agent even after it reaches an optimal policy (no episode limit)',
        dest='train_forever',
        action="store_true")
    parser.add_argument('--eval', help="Run only the evaluation task", dest='eval', action="store_true")
    parser.add_argument('-r', '--render', help="Render the task", dest='render', action="store_true")

    args = parser.parse_args()

    evaluation_config_path = os.path.join(args.folder, "evaluation.yml")
    evaluation_config = EvaluationConfig.load_from_yaml(evaluation_config_path)

    network_config_path = os.path.join(args.folder, "network.yml")
    network_config = NetworkConfig.load_from_yaml(network_config_path)

    reinforce_config_path = os.path.join(args.folder, "reinforce.yml")
    reinforce_config = ReinforceConfig.load_from_yaml(reinforce_config_path)

    if args.test:
        evaluation_config.training_episodes = 0

    if args.collecting_experience:
        reinforce_config.collecting_experience = True

    map_name = args.map
    if map_name is None:
        print("You are training the agent for the default map:")
        print("TugOfWar")
    else:
        print("You are training the agent for the map:")
        print(map_name)

    if args.eval:
        evaluation_config.training_episodes = 0
        network_config.restore_network = True

    if args.render:
        evaluation_config.render = True

    # Dispatch to the task runner that matches the requested task.
    if args.task == 'task_bigA':
        run_task_bigA(evaluation_config, network_config, reinforce_config,
                      map_name=map_name, train_forever=args.train_forever)
    elif args.task == 'task_single_player':
        run_task(evaluation_config, network_config, reinforce_config,
                 map_name=map_name, train_forever=args.train_forever)
    elif args.task == 'task_2p':
        run_task_2p(evaluation_config, network_config, reinforce_config,
                    map_name=map_name, train_forever=args.train_forever)
    elif args.task == 'task_mbts':
        run_task_mbts(evaluation_config, network_config, reinforce_config,
                      map_name=map_name, train_forever=args.train_forever)
    elif args.task == 'task_2p_2l':
        run_task_2p_2l(evaluation_config, network_config, reinforce_config,
                       map_name=map_name, train_forever=args.train_forever)
    elif args.task == 'task_2p_2l_grid':
        run_task_2p_2l_grid(evaluation_config, network_config, reinforce_config,
                            map_name=map_name, train_forever=args.train_forever)
    elif args.task == 'task_2p_2l_hp':
        run_task_2p_2l_hp(evaluation_config, network_config, reinforce_config,
                          map_name=map_name, train_forever=args.train_forever,
                          agent_model=args.agent_model)
    elif args.task == 'task_2p_2l_deexplanation':
        run_task_2p_2l_deexplanation(evaluation_config, network_config, reinforce_config,
                                     map_name=map_name, agent_model=args.agent_model)
    elif args.task == 'task_2p_2l_grid_decomposed':
        run_task_2p_2l_grid_decomposed(evaluation_config, network_config, reinforce_config,
                                       map_name=map_name)
    elif args.task == 'task_2p_2l_grid_decomposed_trans':
        run_task_2p_2l_grid_decomposed_trans(evaluation_config, network_config, reinforce_config,
                                             map_name=map_name)
    elif args.task == 'task_mbts_grid':
        run_task_mbts_grid(evaluation_config, network_config, reinforce_config,
                           map_name=map_name)
    else:
        print("A task must be specified with --task.")

    return 0
def run_task(evaluation_config, network_config, reinforce_config, map_name=None, train_forever=False):
    if use_cuda:
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
        print("|       USING CUDA       |")
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
    else:
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")
        print("|     NOT USING CUDA     |")
        print("~~~~~~~~~~~~~~~~~~~~~~~~~~")

    flags.FLAGS(sys.argv[:1])

    max_episode_steps = 40
    replay_dimension = evaluation_config.xai_replay_dimension
    env = TugOfWar(map_name=map_name,
                   generate_xai_replay=evaluation_config.generate_xai_replay,
                   xai_replay_dimension=replay_dimension)

    reward_types = env.reward_types
    combine_sa = env.combine_sa
    state_1, state_2 = env.reset()

    # Pick the reward decomposition that matches the network's output shape.
    if network_config.output_shape == 4:
        reward_num = 4
        combine_decomposed_func = combine_decomposed_func_4
        player_1_end_vector = player_1_end_vector_4
    if network_config.output_shape == 8:
        reward_num = 8
        combine_decomposed_func = combine_decomposed_func_8
        player_1_end_vector = player_1_end_vector_8
    if network_config.output_shape == 1:
        reward_num = 1
        combine_decomposed_func = combine_decomposed_func_1
        player_1_end_vector = player_1_end_vector_1

    if not reinforce_config.is_random_agent_1:
        agent_1 = SADQAdaptive(name="TugOfWar",
                               state_length=len(state_1),
                               network_config=network_config,
                               reinforce_config=reinforce_config,
                               reward_num=reward_num,
                               combine_decomposed_func=combine_decomposed_func)
        print("sadq agent 1")
    else:
        print("random agent 1")

    training_summaries_path = evaluation_config.summaries_path + "/train"
    clear_summary_path(training_summaries_path)
    train_summary_writer = SummaryWriter(training_summaries_path)

    test_summaries_path = evaluation_config.summaries_path + "/test"
    clear_summary_path(test_summaries_path)
    test_summary_writer = SummaryWriter(test_summaries_path)

    agents_2 = ["random", "random_2"]
    round_num = 0
    privous_result = []
    update_wins_waves = 10
    all_experiences = []
    path = './saved_models/tug_of_war/agents/grid'
    exp_save_path = 'abp/examples/pysc2/tug_of_war/rand_v_rand.pt'

    if reinforce_config.collecting_experience and not reinforce_config.is_random_agent_2:
        # Load every saved opponent checkpoint and play against all of them while collecting experience.
        agent_1_model = "TugOfWar_eval.pupdate_240"
        exp_save_path = 'abp/examples/pysc2/tug_of_war/all_experiences.pt'
        for r, d, f in os.walk(path):
            for file in f:
                if '.p' in file:
                    new_weights = torch.load(path + "/" + file)
                    new_agent_2 = SADQAdaptive(
                        name=file,
                        state_length=len(state_1),
                        network_config=network_config,
                        reinforce_config=reinforce_config,
                        memory_resotre=False,
                        reward_num=reward_num,
                        combine_decomposed_func=combine_decomposed_func)
                    new_agent_2.load_weight(new_weights)
                    new_agent_2.disable_learning(is_save=False)
                    agents_2.append(new_agent_2)
                    if agent_1_model == file:
                        print("********agent_1_model", file)
                        agent_1.load_model(new_agent_2.eval_model)
    elif network_config.restore_network:
        # Restore previously frozen self-play opponents from the network path.
        restore_path = network_config.network_path
        for r, d, f in os.walk(restore_path):
            f = sorted(f)
            for file in f:
                if 'eval.pupdate' in file or 'eval.p_the_best' in file:
                    new_weights = torch.load(restore_path + "/" + file)
                    new_agent_2 = SADQAdaptive(
                        name=file,
                        state_length=len(state_1),
                        network_config=network_config,
                        reinforce_config=reinforce_config,
                        memory_resotre=False,
                        reward_num=reward_num,
                        combine_decomposed_func=combine_decomposed_func)
                    new_agent_2.load_weight(new_weights)
                    new_agent_2.disable_learning(is_save=False)
                    agents_2.append(new_agent_2)
                    print("loaded agent:", file)
        # agent_1.steps = reinforce_config.epsilon_timesteps / 2

    if evaluation_config.generate_xai_replay:
        # Replay generation pits two fixed checkpoints against each other.
        agent_1_model = "TugOfWar_eval.pupdate_600"
        agent_2_model = "TugOfWar_eval.pupdate_560"
        agents_2 = []
        if use_cuda:
            weights_1 = torch.load(path + "/" + agent_1_model)
            weights_2 = torch.load(path + "/" + agent_2_model)
        else:
            weights_1 = torch.load(path + "/" + agent_1_model,
                                   map_location=lambda storage, loc: storage)
            weights_2 = torch.load(path + "/" + agent_2_model,
                                   map_location=lambda storage, loc: storage)

        new_agent_2 = SADQAdaptive(
            name="record",
            state_length=len(state_1),
            network_config=network_config,
            reinforce_config=reinforce_config,
            memory_resotre=False,
            reward_num=reward_num,
            combine_decomposed_func=combine_decomposed_func)

        agent_1.load_weight(weights_1)
        new_agent_2.load_weight(weights_2)
        new_agent_2.disable_learning(is_save=False)
        agents_2.append(new_agent_2)

    if reinforce_config.is_use_sepcific_enemy:
        # Train against a single, specific enemy checkpoint instead of the self-play pool.
        sepcific_SADQ_enemy_weights = torch.load(reinforce_config.enemy_path)
        sepcific_network_config = NetworkConfig.load_from_yaml(
            "./tasks/tug_of_war/sadq_2p_2l_decom/v2_8/network.yml")
        sepcific_network_config.restore_network = False
        sepcific_SADQ_enemy = SADQAdaptive(
            name="sepcific enemy",
            state_length=len(state_1),
            network_config=sepcific_network_config,
            reinforce_config=reinforce_config,
            memory_resotre=False,
            reward_num=sepcific_network_config.output_shape,
            combine_decomposed_func=combine_decomposed_func_8)
        sepcific_SADQ_enemy.load_weight(sepcific_SADQ_enemy_weights)
        sepcific_SADQ_enemy.disable_learning(is_save=False)
        agents_2 = [sepcific_SADQ_enemy]

    while True:
        print(sum(np.array(privous_result) >= 0.9))
        # Self-play update: once the last `update_wins_waves` evaluation results are all >= 0.9,
        # freeze a copy of agent 1 as a new opponent and partially reset exploration.
        if len(privous_result) >= update_wins_waves and \
                sum(np.array(privous_result) >= 0.9) >= update_wins_waves and \
                not reinforce_config.is_random_agent_2 and not reinforce_config.is_use_sepcific_enemy:
            privous_result = []
            print("replace enemy agent's weight with self agent")
            f = open(evaluation_config.result_path, "a+")
            f.write("Update agent\n")
            f.close()

            new_agent_2 = SADQAdaptive(
                name="TugOfWar_" + str(round_num),
                state_length=len(state_2),
                network_config=network_config,
                reinforce_config=reinforce_config,
                memory_resotre=False,
                reward_num=reward_num,
                combine_decomposed_func=combine_decomposed_func)
            new_agent_2.load_model(agent_1.eval_model)
            new_agent_2.disable_learning(is_save=False)
            agents_2.append(new_agent_2)

            agent_1.steps = reinforce_config.epsilon_timesteps / 2
            agent_1.best_reward_mean = 0
            agent_1.save(force=True, appendix="update_" + str(round_num))
            round_num += 1

        print("=======================================================================")
        print("===============================Now training============================")
        print("=======================================================================")
        print("Now training.")
        print("Now have {} enemies".format(len(agents_2)))

        # Training episodes
        for idx_enemy, enemy_agent in enumerate(agents_2):
            if reinforce_config.collecting_experience:
                break
            if type(enemy_agent) == type("random"):
                print(enemy_agent)
            else:
                print(enemy_agent.name)

            # Train the full budget against the newest opponent, a few episodes against older ones.
            if idx_enemy == len(agents_2) - 1:
                training_num = evaluation_config.training_episodes
            else:
                training_num = 10

            for episode in tqdm(range(training_num)):
                state_1, state_2 = env.reset()
                total_reward = 0
                skiping = True
                done = False
                steps = 0

                while skiping:
                    state_1, state_2, done, dp = env.step([], 0)
                    if dp or done:
                        break

                last_mineral = state_1[env.miner_index]

                while not done and steps < max_episode_steps:
                    steps += 1

                    # Decision point
                    if agent_1.steps < reinforce_config.epsilon_timesteps:
                        actions_1 = env.get_big_A(state_1[env.miner_index],
                                                  state_1[env.pylon_index], is_train=1)
                    else:
                        actions_1 = env.get_big_A(state_1[env.miner_index],
                                                  state_1[env.pylon_index], is_train=0)
                    actions_2 = env.get_big_A(state_2[env.miner_index],
                                              state_2[env.pylon_index], is_train=1)

                    assert state_1[-1] == state_2[-1] == steps, print(state_1, state_2, steps)

                    if not reinforce_config.is_random_agent_1:
                        combine_states_1 = combine_sa(state_1, actions_1)
                        choice_1, _ = agent_1.predict(env.normalization(combine_states_1))
                    else:
                        # Still build the state/action combinations so the mineral tracking below can index them.
                        combine_states_1 = combine_sa(state_1, actions_1)
                        choice_1 = randint(0, len(actions_1) - 1)

                    if not reinforce_config.is_random_agent_2 and type(enemy_agent) != type("random"):
                        combine_states_2 = combine_sa(state_2, actions_2)
                        choice_2, _ = enemy_agent.predict(env.normalization(combine_states_2))
                    else:
                        if enemy_agent == "random_2":
                            actions_2 = env.get_big_A(state_2[env.miner_index],
                                                      state_2[env.pylon_index])
                        choice_2 = randint(0, len(actions_2) - 1)

                    env.step(list(actions_1[choice_1]), 1)
                    env.step(list(actions_2[choice_2]), 2)

                    last_mineral = combine_states_1[choice_1][env.miner_index]
                    l_m_1 = state_1[env.miner_index]
                    l_m_2 = state_2[env.miner_index]

                    while skiping:
                        state_1, state_2, done, dp = env.step([], 0)
                        if dp or done:
                            break

                    reward = [0] * reward_num
                    if steps == max_episode_steps or done:
                        reward = player_1_end_vector(state_1[63], state_1[64],
                                                     state_1[65], state_1[66], is_done=done)

                    if not reinforce_config.is_random_agent_1:
                        agent_1.reward(reward)

                if not reinforce_config.is_random_agent_1:
                    agent_1.end_episode(env.normalization(state_1))
                # train_summary_writer.add_scalar(tag="Train/Episode Reward",
                #                                 scalar_value=total_reward, global_step=episode + 1)
                # train_summary_writer.add_scalar(tag="Train/Steps to choosing Enemies",
                #                                 scalar_value=steps + 1, global_step=episode + 1)

        if not reinforce_config.is_random_agent_1:
            agent_1.disable_learning(
                is_save=not reinforce_config.collecting_experience
                and not evaluation_config.generate_xai_replay)

        total_rewwards_list = []

        # Test episodes
        print("======================================================================")
        print("===============================Now testing============================")
        print("======================================================================")

        tied_lose = 0
        for idx_enemy, enemy_agent in enumerate(agents_2):
            average_end_state = np.zeros(len(state_1))
            if type(enemy_agent) == type("random"):
                print(enemy_agent)
            else:
                print(enemy_agent.name)

            if idx_enemy == len(agents_2) - 1 and not reinforce_config.collecting_experience:
                test_num = evaluation_config.test_episodes
            else:
                test_num = 5

            for episode in tqdm(range(test_num)):
                env.reset()
                total_reward_1 = 0
                done = False
                skiping = True
                steps = 0
                previous_state_1 = None
                previous_state_2 = None
                previous_action_1 = None
                previous_action_2 = None

                if evaluation_config.generate_xai_replay:
                    recorder = XaiReplayRecorder2LaneNexus(env.sc2_env, episode,
                                                           evaluation_config.env,
                                                           action_component_names,
                                                           replay_dimension)

                while skiping:
                    state_1, state_2, done, dp = env.step([], 0)
                    if evaluation_config.generate_xai_replay:
                        recorder.record_game_clock_tick(env.decomposed_reward_dict)
                    if dp or done:
                        break

                while not done and steps < max_episode_steps:
                    steps += 1

                    # Decision point
                    if not reinforce_config.is_random_agent_1:
                        actions_1 = env.get_big_A(state_1[env.miner_index],
                                                  state_1[env.pylon_index])
                        combine_states_1 = combine_sa(state_1, actions_1)
                        choice_1, _ = agent_1.predict(env.normalization(combine_states_1))
                    else:
                        actions_1 = env.get_big_A(state_1[env.miner_index],
                                                  state_1[env.pylon_index], is_train=1)
                        combine_states_1 = combine_sa(state_1, actions_1)
                        choice_1 = randint(0, len(actions_1) - 1)

                    if not reinforce_config.is_random_agent_2 and type(enemy_agent) != type("random"):
                        actions_2 = env.get_big_A(state_2[env.miner_index],
                                                  state_2[env.pylon_index])
                        combine_states_2 = combine_sa(state_2, actions_2)
                        choice_2, _ = enemy_agent.predict(env.normalization(combine_states_2))
                    else:
                        if enemy_agent == "random_2":
                            actions_2 = env.get_big_A(state_2[env.miner_index],
                                                      state_2[env.pylon_index], is_train=0)
                        else:
                            actions_2 = env.get_big_A(state_2[env.miner_index],
                                                      state_2[env.pylon_index], is_train=1)
                        combine_states_2 = combine_sa(state_2, actions_2)
                        choice_2 = randint(0, len(actions_2) - 1)

                    if evaluation_config.generate_xai_replay:
                        recorder.record_decision_point(actions_1[choice_1], actions_2[choice_2],
                                                       state_1, state_2,
                                                       env.decomposed_reward_dict)

                    # Experience collecting
                    if reinforce_config.collecting_experience:
                        if previous_state_1 is not None and previous_state_2 is not None and \
                                previous_action_1 is not None and previous_action_2 is not None:
                            previous_state_1[8:14] = previous_state_2[1:7]  # Include player 2's action
                            previous_state_1[env.miner_index] += previous_state_1[env.pylon_index] * 75 + 100
                            previous_state_1[-1] += 1
                            experience = [previous_state_1, np.append(state_1, previous_reward_1)]
                            all_experiences.append(experience)
                            if (len(all_experiences) % 100 == 0) and reinforce_config.collecting_experience:
                                torch.save(all_experiences, exp_save_path)

                        previous_state_1 = deepcopy(combine_states_1[choice_1])
                        previous_state_2 = deepcopy(combine_states_2[choice_2])
                        previous_action_1 = deepcopy(actions_1[choice_1])
                        previous_action_2 = deepcopy(actions_2[choice_2])

                    env.step(list(actions_1[choice_1]), 1)
                    env.step(list(actions_2[choice_2]), 2)

                    # Human play (manual control of player 2):
                    # pretty_print(state_2, text="state:")
                    # env.step(list(get_human_action()), 2)

                    while skiping:
                        state_1, state_2, done, dp = env.step([], 0)
                        if evaluation_config.generate_xai_replay:
                            recorder.record_game_clock_tick(env.decomposed_reward_dict)
                        if dp or done:
                            break

                    reward = [0] * reward_num
                    if steps == max_episode_steps or done:
                        reward = player_1_end_vector(state_1[63], state_1[64],
                                                     state_1[65], state_1[66], is_done=done)

                    if reward_num == 4:
                        current_reward_1 = sum(reward[2:])
                    elif reward_num == 8:
                        current_reward_1 = reward[2] + reward[3] + reward[6] + reward[7]
                    elif reward_num == 1:
                        current_reward_1 = sum(reward)

                    total_reward_1 += current_reward_1
                    previous_reward_1 = current_reward_1

                if reinforce_config.collecting_experience:
                    previous_state_1[8:14] = previous_state_2[1:7]  # Include player 2's action
                    previous_state_1[env.miner_index] += previous_state_1[env.pylon_index] * 75 + 100
                    previous_state_1[-1] += 1
                    experience = [previous_state_1, np.append(state_1, previous_reward_1)]
                    all_experiences.append(experience)
                    if (len(all_experiences) % 100 == 0) and reinforce_config.collecting_experience:
                        torch.save(all_experiences, exp_save_path)

                average_end_state += state_1
                total_rewwards_list.append(total_reward_1)
                test_summary_writer.add_scalar(tag="Test/Episode Reward",
                                               scalar_value=total_reward_1,
                                               global_step=episode + 1)
                test_summary_writer.add_scalar(tag="Test/Steps to choosing Enemies",
                                               scalar_value=steps + 1,
                                               global_step=episode + 1)

            total_rewards_list_np = np.array(total_rewwards_list)
            tied = np.sum(total_rewards_list_np[-test_num:] == 0)
            wins = np.sum(total_rewards_list_np[-test_num:] > 0)
            lose = np.sum(total_rewards_list_np[-test_num:] <= 0)
            tied_lose += (tied + lose)

            print("wins/lose/tied")
            print(
                str(wins / test_num * 100) + "% \t",
                str(lose / test_num * 100) + "% \t",
                # str(tied / test_num * 100) + "% \t",
            )
            pretty_print(average_end_state / test_num)

        tr = sum(total_rewwards_list) / len(total_rewwards_list)
        print("total reward:")
        print(tr)

        privous_result.append(tr)
        if len(privous_result) > update_wins_waves:
            del privous_result[0]

        f = open(evaluation_config.result_path, "a+")
        f.write(str(tr) + "\n")
        f.close()

        if tied_lose == 0 and not reinforce_config.is_random_agent_1:
            agent_1.save(force=True, appendix="_the_best")

        if not reinforce_config.is_random_agent_1:
            agent_1.enable_learning()