def train(num_episodes: int, game_interface: HFOAttackingPlayer,
          features: discrete_features_v2.DiscreteFeaturesV2,
          agent: QLearningAgentV4, actions: DiscreteActionsV2,
          reward_funct):
    for ep in range(num_episodes):
        # print('<Training> Episode {}/{}:'.format(ep, num_episodes))
        aux_positions_names = set()
        aux_actions_played = set()
        while game_interface.in_game():
            # Update environment features:
            features.update_features(game_interface.get_state())
            curr_state_id = features.get_state_index()
            has_ball = features.has_ball()
            # Act:
            action_idx = agent.act(curr_state_id)
            hfo_action: tuple = actions.map_action_idx_to_hfo_action(
                agent_pos=features.get_pos_tuple(),
                has_ball=has_ball,
                action_idx=action_idx)
            # Step:
            status, observation = game_interface.step(hfo_action, has_ball)
            reward = reward_funct(status)
            # Save metrics:
            agent.save_visited_state(curr_state_id, action_idx)
            agent.cum_reward += reward
            aux_positions_names.add(features.get_position_name())
            action_name = actions.map_action_to_str(action_idx, has_ball)
            aux_actions_played.add(action_name)
            # Update environment features:
            prev_state_id = curr_state_id
            features.update_features(observation)
            curr_state_id = features.get_state_index()
            agent.store_ep(state_idx=prev_state_id, action_idx=action_idx,
                           reward=reward, next_state_idx=curr_state_id,
                           has_ball=has_ball,
                           done=not game_interface.in_game())
        agent.learn()
        # print(':: Episode: {}; reward: {}; epsilon: {}; positions: {}; '
        #       'actions: {}'.format(ep, agent.cum_reward, agent.epsilon,
        #                            aux_positions_names, aux_actions_played))
        agent.save_metrics(agent.old_q_table, agent.q_table)
        # Reset player:
        agent.reset()
        agent.update_hyper_parameters(episode=ep,
                                      num_total_episodes=num_total_train_ep := num_episodes) if False else \
            agent.update_hyper_parameters(episode=ep,
                                          num_total_episodes=num_episodes)
        # Game Reset
        game_interface.reset()
    agent.save_model()
    # Name the actions via the actions interface passed in (not a module-level
    # actions_manager) so the function is self-contained:
    actions_name = [actions.map_action_to_str(i, has_ball=True)
                    for i in range(agent.num_actions)]
    agent.export_metrics(training=True, actions_name=actions_name)
def test(num_episodes: int, game_interface: HFOAttackingPlayer,
         features: discrete_features_v2.DiscreteFeaturesV2,
         agent: QLearningAgentV5, actions: DiscreteActionsV5,
         reward_funct):
    """
    @param num_episodes: number of episodes to run
    @param game_interface: game interface that manages the interaction
        between the agent and the HFO server;
    @param features: features interface; extracts the main features for the
        agent from the observation array;
    @param agent: learning agent;
    @param actions: actions interface;
    @param reward_funct: reward function used
    @return: (float) the average reward
    """
    # Run test episodes with the greedy (exploit-only) policy:
    sum_score = 0
    for ep in range(num_episodes):
        print('<Test> {}/{}:'.format(ep, num_episodes))
        # Go to origin position:
        features.update_features(game_interface.get_state())
        go_to_origin_position(game_interface=game_interface,
                              features=features,
                              actions=actions)
        # Test loop:
        while game_interface.in_game():
            # Update environment features:
            curr_state_id = features.get_state_index()
            has_ball = features.has_ball()
            # Act:
            action_idx = agent.exploit_actions(curr_state_id)
            hfo_action_params, num_rep = \
                actions.map_action_idx_to_hfo_action(
                    agent_pos=features.get_pos_tuple(),
                    has_ball=has_ball,
                    action_idx=action_idx)
            action_name = actions.map_action_to_str(action_idx, has_ball)
            # Step (repeat the low-level action num_rep times):
            rep_counter_aux = 0
            while game_interface.in_game() and rep_counter_aux < num_rep:
                status, observation = game_interface.step(
                    hfo_action_params, has_ball)
                rep_counter_aux += 1
            reward = reward_funct(status)
            # Update features:
            features.update_features(observation)
            # Save metrics:
            agent.save_visited_state(curr_state_id, action_idx)
            sum_score += reward
        # Reset player:
        agent.reset(training=False)
        # Game Reset
        game_interface.reset()
    return sum_score / num_episodes
def test(train_ep: int, num_episodes: int, game_interface: HFOAttackingPlayer,
         features: discrete_features_v2.DiscreteFeaturesV2,
         agent: QLearningAgentV4, actions: DiscreteActionsV2,
         reward_funct):
    # Evaluate the current Q-table with the greedy (exploit-only) policy:
    score = 0
    agent.test_episodes.append(train_ep)
    for ep in range(num_episodes):
        print('<Test> {}/{}:'.format(ep, num_episodes))
        prev_state_id = -1
        while game_interface.in_game():
            # Update environment features:
            features.update_features(game_interface.get_state())
            curr_state_id = features.get_state_index()
            has_ball = features.has_ball()
            # Act (print the Q-values only when the state changed):
            if prev_state_id != curr_state_id:
                print([round(val, 2) for val in agent.q_table[curr_state_id]])
            action_idx = agent.exploit_actions(curr_state_id)
            hfo_action: tuple = actions.map_action_idx_to_hfo_action(
                agent_pos=features.get_pos_tuple(),
                has_ball=has_ball,
                action_idx=action_idx)
            # Step:
            status, observation = game_interface.step(hfo_action, has_ball)
            prev_state_id = curr_state_id
            # Save Metrics:
            agent.save_visited_state(curr_state_id, action_idx)
            agent.cum_reward += reward_funct(status)
        print(':: Episode: {}; reward: {}'.format(ep, agent.cum_reward))
        score += 1 if game_interface.status == GOAL else 0
        # Reset player:
        agent.reset(training=False)
        # Game Reset
        game_interface.reset()
    agent.scores.append(score)
    actions_name = [actions.map_action_to_str(i, has_ball=True)
                    for i in range(agent.num_actions)]
    agent.export_metrics(training=False, actions_name=actions_name)
def go_to_origin_position(game_interface: HFOAttackingPlayer,
                          features: discrete_features_v2.DiscreteFeaturesV2,
                          actions: DiscreteActionsV5,
                          random_start: bool = True):
    if random_start:
        pos_name, origin_pos = random.choice(list(ORIGIN_POSITIONS.items()))
    else:
        pos_name = "Fixed start"
        origin_pos = features.get_pos_tuple()
    print("\nMoving to starting point: {0}".format(pos_name))
    pos = features.get_pos_tuple(round_ndigits=1)
    while origin_pos != pos:
        has_ball = features.has_ball()
        hfo_action: tuple = actions.dribble_to_pos(origin_pos)
        status, observation = game_interface.step(hfo_action, has_ball)
        features.update_features(observation)
        pos = features.get_pos_tuple(round_ndigits=1)
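# A minimal sketch of the ORIGIN_POSITIONS structure that go_to_origin_position
# expects: a mapping from a readable name to an (x, y) starting position on the
# field, with coordinates rounded to one decimal so the loop above can match
# them against get_pos_tuple(round_ndigits=1). The names and coordinates below
# are illustrative assumptions, not the values actually used in this repo.
SKETCH_ORIGIN_POSITIONS = {
    "TOP LEFT": (-0.5, -0.7),
    "MID LEFT": (-0.5, 0.0),
    "BOTTOM LEFT": (-0.5, 0.7),
}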
def train(num_train_episodes: int, num_total_train_ep: int,
          game_interface: HFOAttackingPlayer,
          features: discrete_features_v2.DiscreteFeaturesV2,
          agent: QLearningAgentV5, actions: DiscreteActionsV5,
          save_metrics: bool, reward_funct):
    """
    @param num_train_episodes: number of episodes to train in this iteration
    @param num_total_train_ep: total number of episodes to train
    @param game_interface: game interface that manages the interaction
        between the agent and the HFO server;
    @param features: features interface; extracts the main features for the
        agent from the observation array;
    @param agent: learning agent;
    @param actions: actions interface;
    @param save_metrics: flag; if true, save the metrics;
    @param reward_funct: reward function used
    @return: (QLearningAgentV5) the agent
    """
    for ep in range(num_train_episodes):
        # Go to origin position:
        features.update_features(game_interface.get_state())
        go_to_origin_position(game_interface=game_interface,
                              features=features,
                              actions=actions)
        # Start learning loop
        aux_positions_names = set()
        aux_actions_played = set()
        while game_interface.in_game():
            # Update environment features:
            curr_state_id = features.get_state_index()
            has_ball = features.has_ball()
            # Act:
            action_idx = agent.act(curr_state_id)
            hfo_action_params, num_rep = \
                actions.map_action_idx_to_hfo_action(
                    agent_pos=features.get_pos_tuple(),
                    has_ball=has_ball,
                    action_idx=action_idx)
            # Step (repeat the low-level action num_rep times):
            rep_counter_aux = 0
            while game_interface.in_game() and rep_counter_aux < num_rep:
                status, observation = game_interface.step(
                    hfo_action_params, has_ball)
                rep_counter_aux += 1
            reward = reward_funct(status)
            # Save metrics:
            if save_metrics:
                agent.save_visited_state(curr_state_id, action_idx)
                agent.cum_reward += reward
                aux_positions_names.add(features.get_position_name())
                action_name = actions.map_action_to_str(action_idx, has_ball)
                aux_actions_played.add(action_name)
            # Update environment features:
            prev_state_id = curr_state_id
            features.update_features(observation)
            curr_state_id = features.get_state_index()
            agent.store_ep(state_idx=prev_state_id, action_idx=action_idx,
                           reward=reward, next_state_idx=curr_state_id,
                           has_ball=has_ball,
                           done=not game_interface.in_game())
        agent.learn()
        # print(':: Episode: {}; reward: {}; epsilon: {}; positions: {}; '
        #       'actions: {}'.format(ep, agent.cum_reward, agent.epsilon,
        #                            aux_positions_names, aux_actions_played))
        if save_metrics:
            agent.save_metrics(agent.old_q_table, agent.q_table)
        # Reset player:
        agent.reset()
        agent.update_hyper_parameters(episode=agent.train_eps,
                                      num_total_episodes=num_total_train_ep)
        # Game Reset
        game_interface.reset()
    agent.save_model()
    if save_metrics:
        actions_name = [actions.map_action_to_str(i, has_ball=True)
                        for i in range(agent.num_actions)]
        agent.export_metrics(training=True, actions_name=actions_name)
    return agent
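# A minimal sketch of a reward function with the signature the train/test loops
# above expect (reward_funct(status) -> number). It assumes the reward is
# derived only from the HFO status code; the basic_reward actually used in this
# repo may differ.
def sketch_basic_reward(status) -> int:
    if status == GOAL:
        return 1       # scoring is the only positive outcome
    elif status == IN_GAME:
        return 0       # episode still running, no terminal signal yet
    else:
        return -1      # captured by defense, out of bounds, out of time, ...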
parser = argparse.ArgumentParser()
parser.add_argument('--port', type=int, default=6000)
args = parser.parse_args()
port = args.port

hfo = HFOEnvironment()
hfo.connectToServer(feature_set=HIGH_LEVEL_FEATURE_SET, server_port=port,
                    config_dir=CONFIG_DIR)

for i in range(1):
    status = IN_GAME
    score = 0
    NUM_TEAMMATES = 0
    NUM_OPPONENTS = 1
    observation = hfo.getState()
    env = DiscreteFeaturesV2(num_team=NUM_TEAMMATES, num_op=NUM_OPPONENTS)
    went_to_the_corner = False
    ep = 0
    print("NEW GAME:")
    for i in range(4):
        print("New game:")
        print("Status: ", hfo.step())
        status = IN_GAME
        while status == IN_GAME:
            print("waiting observation")
            observation = hfo.getState()
            env.update_features(observation)
            pos_tuple = env.get_pos_tuple()
            print("waiting action")
            if ep < 10:
                hfo.act(DRIBBLE_TO, -0.7, 0)
def setUpClass(cls) -> None:
    super(TestHighLevelEnvironment, cls).setUpClass()
    cls.features_manager = DiscreteFeaturesV2(0, 1)
def test(num_episodes: int, game_interface: HFOAttackingPlayer,
         features: DiscreteFeaturesV2, agent: QLearningAgentV6,
         actions: DiscreteActionsV5, reward_funct) -> float:
    """
    @param num_episodes: number of episodes to run
    @param game_interface: game interface that manages the interaction
        between the agent and the HFO server;
    @param features: features interface; extracts the main features for the
        agent from the observation array;
    @param agent: learning agent;
    @param actions: actions interface;
    @param reward_funct: reward function used
    @return: (float) the average reward
    """
    # Run test episodes with the greedy (exploit-only) policy:
    sum_score = 0
    for ep in range(num_episodes):
        # Check if server is still up:
        if game_interface.hfo.step() == SERVER_DOWN:
            print("Server is down while testing; episode={}".format(ep))
            break
        # Go to origin position:
        features.update_features(game_interface.get_state())
        go_to_origin_position(game_interface=game_interface,
                              features=features,
                              actions=actions)
        # Test loop:
        debug_counter = 0  # TODO remove
        while game_interface.in_game():
            # Update environment features:
            curr_state_id = features.get_state_index()
            has_ball = features.has_ball()
            # Act:
            debug_counter += 1
            action_idx = agent.exploit_actions(curr_state_id)
            hfo_action_params, num_rep = \
                actions.map_action_idx_to_hfo_action(
                    agent_pos=features.get_pos_tuple(),
                    has_ball=has_ball,
                    action_idx=action_idx)
            # Step:
            status, observation = execute_action(
                action_params=hfo_action_params,
                repetitions=num_rep,
                has_ball=has_ball,
                game_interface=game_interface)
            # Update features:
            reward = reward_funct(status)
            features.update_features(observation)
            sum_score += reward
        if status == OUT_OF_TIME:
            if debug_counter < 5:
                raise NoActionPlayedError(
                    "agent was only able to choose {}".format(debug_counter))
        # Game Reset
        game_interface.reset()
    print("<<TEST>> AVR reward = ", sum_score / (ep + 1))
    return sum_score / num_episodes
def train(num_train_episodes: int, num_total_train_ep: int,
          game_interface: HFOAttackingPlayer,
          features: DiscreteFeaturesV2, agent: QLearningAgentV6,
          actions: DiscreteActionsV5, reward_funct):
    """
    @param num_train_episodes: number of episodes to train in this iteration
    @param num_total_train_ep: total number of episodes to train
    @param game_interface: game interface that manages the interaction
        between the agent and the HFO server;
    @param features: features interface; extracts the main features for the
        agent from the observation array;
    @param agent: learning agent;
    @param actions: actions interface;
    @param reward_funct: reward function used
    @return: (QLearningAgentV6) the agent
    """
    sum_score = 0
    sum_epsilons = 0
    agent.counter_explorations = 0
    agent.counter_exploitations = 0
    for ep in range(num_train_episodes):
        # Check if server is still up:
        # if game_interface.hfo.step() == SERVER_DOWN:
        #     raise ServerDownError("training; episode={}".format(ep))
        # Go to origin position:
        features.update_features(game_interface.get_state())
        go_to_origin_position(game_interface=game_interface,
                              features=features,
                              actions=actions)
        # Start learning loop
        debug_counter = 0  # TODO remove
        while game_interface.in_game():
            # Update environment features:
            curr_state_id = features.get_state_index()
            has_ball = features.has_ball()
            # Act:
            debug_counter += 1
            action_idx = agent.act(curr_state_id)
            hfo_action_params, num_rep = \
                actions.map_action_idx_to_hfo_action(
                    agent_pos=features.get_pos_tuple(),
                    has_ball=has_ball,
                    action_idx=action_idx)
            # Step:
            status, observation = execute_action(
                action_params=hfo_action_params,
                repetitions=num_rep,
                has_ball=has_ball,
                game_interface=game_interface)
            # Update environment features:
            reward = reward_funct(status)
            sum_score += reward
            features.update_features(observation)
            new_state_id = features.get_state_index()
            agent.store_ep(state_idx=curr_state_id, action_idx=action_idx,
                           reward=reward, next_state_idx=new_state_id,
                           has_ball=has_ball,
                           done=not game_interface.in_game())
        if status == OUT_OF_TIME:
            if debug_counter < 5:
                raise NoActionPlayedError(
                    "agent was only able to choose {}".format(debug_counter))
        agent.learn_buffer()
        agent.update_hyper_parameters(num_total_episodes=num_total_train_ep)
        sum_epsilons += agent.epsilon
        # Game Reset
        game_interface.reset()
    print("<<TRAIN>> AVR reward = ", sum_score / num_train_episodes)
    print("<<TRAIN>> %Explorations={}% ".format(
        round(agent.counter_explorations /
              (agent.counter_exploitations + agent.counter_explorations),
              4) * 100))
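# A minimal sketch of the execute_action helper used by the V6 train/test loops
# above, assuming it simply repeats the mapped HFO action, mirroring the inline
# repetition loops of the earlier V5 versions. The real helper in this repo may
# differ (e.g. in how it handles episodes that end mid-repetition).
def sketch_execute_action(action_params, repetitions: int, has_ball: bool,
                          game_interface: HFOAttackingPlayer):
    status = IN_GAME
    observation = game_interface.get_state()
    rep = 0
    while game_interface.in_game() and rep < repetitions:
        status, observation = game_interface.step(action_params, has_ball)
        rep += 1
    return status, observation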
num_episodes = (num_train_ep + num_test_ep) * num_repetitions
# Directory
save_dir = args.save_dir or mkdir(num_episodes, num_op, extra_note="oldEps")

print("Starting Training - id={}; num_opponents={}; num_teammates={}; "
      "num_episodes={};".format(agent_id, num_op, num_team, num_episodes))

# Initialize connection with the HFO server
hfo_interface = HFOAttackingPlayer(agent_id=agent_id,
                                   num_opponents=num_op,
                                   num_teammates=num_team)
hfo_interface.connect_to_server()

# Agent set-up
reward_function = basic_reward
features_manager = DiscreteFeaturesV2(num_team, num_op)
actions_manager = DiscreteActionsV5()
agent = QLearningAgentV6(num_states=features_manager.get_num_states(),
                         num_actions=actions_manager.get_num_actions(),
                         learning_rate=0.1,
                         discount_factor=0.9,
                         epsilon=0.8)

# Test once before any training:
av_reward = test(num_episodes=num_test_ep,
                 agent=agent,
                 game_interface=hfo_interface,
                 features=features_manager,
                 actions=actions_manager,
                 reward_funct=reward_function)

# Save metrics structures