def train(num_episodes: int, game_interface: HFOAttackingPlayer,
          features: discrete_features_v2.DiscreteFeaturesV2,
          agent: QLearningAgentV4, actions: DiscreteActionsV2,
          reward_funct):
    for ep in range(num_episodes):
        # print('<Training> Episode {}/{}:'.format(ep, num_episodes))
        aux_positions_names = set()
        aux_actions_played = set()
        while game_interface.in_game():
            # Update environment features:
            features.update_features(game_interface.get_state())
            curr_state_id = features.get_state_index()
            has_ball = features.has_ball()
            # Act:
            action_idx = agent.act(curr_state_id)
            hfo_action: tuple = actions.map_action_idx_to_hfo_action(
                agent_pos=features.get_pos_tuple(),
                has_ball=has_ball,
                action_idx=action_idx)
            # Step:
            status, observation = game_interface.step(hfo_action, has_ball)
            reward = reward_funct(status)
            # Save metrics:
            agent.save_visited_state(curr_state_id, action_idx)
            agent.cum_reward += reward
            aux_positions_names.add(features.get_position_name())
            action_name = actions.map_action_to_str(action_idx, has_ball)
            aux_actions_played.add(action_name)
            # Update environment features:
            prev_state_id = curr_state_id
            features.update_features(observation)
            curr_state_id = features.get_state_index()
            agent.store_ep(state_idx=prev_state_id, action_idx=action_idx,
                           reward=reward, next_state_idx=curr_state_id,
                           has_ball=has_ball,
                           done=not game_interface.in_game())
        agent.learn()
        # print(':: Episode: {}; reward: {}; epsilon: {}; positions: {}; '
        #       'actions: {}'.format(ep, agent.cum_reward, agent.epsilon,
        #                            aux_positions_names, aux_actions_played))
        agent.save_metrics(agent.old_q_table, agent.q_table)
        # Reset player:
        agent.reset()
        agent.update_hyper_parameters(episode=ep,
                                      num_total_episodes=num_episodes)
        # Game Reset
        game_interface.reset()
    agent.save_model()
    actions_name = [actions.map_action_to_str(i, has_ball=True)
                    for i in range(agent.num_actions)]
    agent.export_metrics(training=True, actions_name=actions_name)
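# The training loop above only assumes that reward_funct maps an HFO episode
# status to a scalar. A minimal sketch of such a function is given below; the
# status constants come from the hfo package, and the specific values
# returned here are an illustrative assumption, not the reward shaping used
# by the original agents.
from hfo import GOAL, CAPTURED_BY_DEFENSE, OUT_OF_BOUNDS, OUT_OF_TIME


def basic_reward_funct(status) -> int:
    """Hypothetical reward: +1 on goal, -1 on any other terminal outcome,
    0 while the episode is still running."""
    if status == GOAL:
        return 1
    if status in (CAPTURED_BY_DEFENSE, OUT_OF_BOUNDS, OUT_OF_TIME):
        return -1
    return 0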
def test(num_episodes: int, game_interface: HFOAttackingPlayer,
         features: discrete_features_v2.DiscreteFeaturesV2,
         agent: QLearningAgentV5, actions: DiscreteActionsV5,
         reward_funct):
    """
    @param num_episodes: number of episodes to run
    @param game_interface: game interface that manages the interaction
        between the agent and the HFO game;
    @param features: features interface that extracts the agent's features
        from the observation array;
    @param agent: learning agent;
    @param actions: actions interface;
    @param reward_funct: reward function used
    @return: (float) the average reward
    """
    # Run test episodes using the learned policy (exploitation only)
    sum_score = 0
    for ep in range(num_episodes):
        print('<Test> {}/{}:'.format(ep, num_episodes))
        # Go to origin position:
        features.update_features(game_interface.get_state())
        go_to_origin_position(game_interface=game_interface,
                              features=features, actions=actions)
        # Test loop:
        while game_interface.in_game():
            # Update environment features:
            curr_state_id = features.get_state_index()
            has_ball = features.has_ball()
            # Act:
            action_idx = agent.exploit_actions(curr_state_id)
            hfo_action_params, num_rep = \
                actions.map_action_idx_to_hfo_action(
                    agent_pos=features.get_pos_tuple(),
                    has_ball=has_ball,
                    action_idx=action_idx)
            action_name = actions.map_action_to_str(action_idx, has_ball)
            # Step (repeat the same action num_rep times):
            rep_counter_aux = 0
            while game_interface.in_game() and rep_counter_aux < num_rep:
                status, observation = game_interface.step(
                    hfo_action_params, has_ball)
                rep_counter_aux += 1
            reward = reward_funct(status)
            # Update features:
            features.update_features(observation)
            # Save metrics:
            agent.save_visited_state(curr_state_id, action_idx)
            sum_score += reward
        # Reset player:
        agent.reset(training=False)
        # Game Reset
        game_interface.reset()
    return sum_score / num_episodes
def test(train_ep: int, num_episodes: int, game_interface: HFOAttackingPlayer,
         features: discrete_features_v2.DiscreteFeaturesV2,
         agent: QLearningAgentV4, actions: DiscreteActionsV2,
         reward_funct):
    # Run test episodes using the learned Q-table (exploitation only)
    score = 0
    agent.test_episodes.append(train_ep)
    for ep in range(num_episodes):
        print('<Test> {}/{}:'.format(ep, num_episodes))
        prev_state_id = -1
        while game_interface.in_game():
            # Update environment features:
            features.update_features(game_interface.get_state())
            curr_state_id = features.get_state_index()
            has_ball = features.has_ball()
            # Act:
            if prev_state_id != curr_state_id:
                print([round(val, 2) for val in agent.q_table[curr_state_id]])
            action_idx = agent.exploit_actions(curr_state_id)
            hfo_action: tuple = actions.map_action_idx_to_hfo_action(
                agent_pos=features.get_pos_tuple(),
                has_ball=has_ball,
                action_idx=action_idx)
            # Step:
            status, observation = game_interface.step(hfo_action, has_ball)
            prev_state_id = curr_state_id
            # Save metrics:
            agent.save_visited_state(curr_state_id, action_idx)
            agent.cum_reward += reward_funct(status)
        print(':: Episode: {}; reward: {}'.format(ep, agent.cum_reward))
        score += 1 if game_interface.status == GOAL else 0
        # Reset player:
        agent.reset(training=False)
        # Game Reset
        game_interface.reset()
    agent.scores.append(score)
    actions_name = [actions.map_action_to_str(i, has_ball=True)
                    for i in range(agent.num_actions)]
    agent.export_metrics(training=False, actions_name=actions_name)
def train(num_train_episodes: int, num_total_train_ep: int,
          game_interface: HFOAttackingPlayer,
          features: discrete_features_v2.DiscreteFeaturesV2,
          agent: QLearningAgentV5, actions: DiscreteActionsV5,
          save_metrics: bool, reward_funct):
    """
    @param num_train_episodes: number of episodes to train in this iteration
    @param num_total_train_ep: total number of training episodes
    @param game_interface: game interface that manages the interaction
        between the agent and the HFO game;
    @param features: features interface that extracts the agent's features
        from the observation array;
    @param agent: learning agent;
    @param actions: actions interface;
    @param save_metrics: flag; if True, save the metrics;
    @param reward_funct: reward function used
    @return: (QLearningAgentV5) the trained agent
    """
    for ep in range(num_train_episodes):
        # Go to origin position:
        features.update_features(game_interface.get_state())
        go_to_origin_position(game_interface=game_interface,
                              features=features, actions=actions)
        # Start learning loop
        aux_positions_names = set()
        aux_actions_played = set()
        while game_interface.in_game():
            # Update environment features:
            curr_state_id = features.get_state_index()
            has_ball = features.has_ball()
            # Act:
            action_idx = agent.act(curr_state_id)
            hfo_action_params, num_rep = \
                actions.map_action_idx_to_hfo_action(
                    agent_pos=features.get_pos_tuple(),
                    has_ball=has_ball,
                    action_idx=action_idx)
            # Step (repeat the same action num_rep times):
            rep_counter_aux = 0
            while game_interface.in_game() and rep_counter_aux < num_rep:
                status, observation = game_interface.step(
                    hfo_action_params, has_ball)
                rep_counter_aux += 1
            reward = reward_funct(status)
            # Save metrics:
            if save_metrics:
                agent.save_visited_state(curr_state_id, action_idx)
                agent.cum_reward += reward
                aux_positions_names.add(features.get_position_name())
                action_name = actions.map_action_to_str(action_idx, has_ball)
                aux_actions_played.add(action_name)
            # Update environment features:
            prev_state_id = curr_state_id
            features.update_features(observation)
            curr_state_id = features.get_state_index()
            agent.store_ep(state_idx=prev_state_id, action_idx=action_idx,
                           reward=reward, next_state_idx=curr_state_id,
                           has_ball=has_ball,
                           done=not game_interface.in_game())
        agent.learn()
        # print(':: Episode: {}; reward: {}; epsilon: {}; positions: {}; '
        #       'actions: {}'.format(ep, agent.cum_reward, agent.epsilon,
        #                            aux_positions_names, aux_actions_played))
        if save_metrics:
            agent.save_metrics(agent.old_q_table, agent.q_table)
        # Reset player:
        agent.reset()
        agent.update_hyper_parameters(episode=agent.train_eps,
                                      num_total_episodes=num_total_train_ep)
        # Game Reset
        game_interface.reset()
    agent.save_model()
    if save_metrics:
        actions_name = [actions.map_action_to_str(i, has_ball=True)
                        for i in range(agent.num_actions)]
        agent.export_metrics(training=True, actions_name=actions_name)
    return agent
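# The V6 train/test functions below delegate the per-action repetition loop
# to an execute_action helper and raise ServerDownError / NoActionPlayedError,
# none of which are defined in this file. The sketch below is one plausible
# shape, inferred from the inline repetition loop of the V5 functions above
# and from the V6 call sites; treat the bodies as assumptions rather than the
# project's actual implementation.
class ServerDownError(Exception):
    """Raised when the HFO server stops responding."""


class NoActionPlayedError(Exception):
    """Raised when an episode ends before the agent could act a minimum
    number of times."""


def execute_action(action_params, repetitions: int, has_ball: bool,
                   game_interface: HFOAttackingPlayer):
    """Repeat the same HFO action for up to `repetitions` steps (or until the
    episode ends) and return the last (status, observation) pair."""
    status, observation = game_interface.status, game_interface.get_state()
    rep = 0
    while game_interface.in_game() and rep < repetitions:
        status, observation = game_interface.step(action_params, has_ball)
        rep += 1
    return status, observation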
def test(num_episodes: int, game_interface: HFOAttackingPlayer,
         features: DiscreteFeaturesV2, agent: QLearningAgentV6,
         actions: DiscreteActionsV5, reward_funct) -> float:
    """
    @param num_episodes: number of episodes to run
    @param game_interface: game interface that manages the interaction
        between the agent and the HFO game;
    @param features: features interface that extracts the agent's features
        from the observation array;
    @param agent: learning agent;
    @param actions: actions interface;
    @param reward_funct: reward function used
    @return: (float) the average reward
    """
    # Run test episodes using the learned policy (exploitation only)
    sum_score = 0
    for ep in range(num_episodes):
        # Check if the server is still up:
        if game_interface.hfo.step() == SERVER_DOWN:
            print("Server is down while testing; episode={}".format(ep))
            break
        # Go to origin position:
        features.update_features(game_interface.get_state())
        go_to_origin_position(game_interface=game_interface,
                              features=features, actions=actions)
        # Test loop:
        debug_counter = 0  # TODO remove
        while game_interface.in_game():
            # Update environment features:
            curr_state_id = features.get_state_index()
            has_ball = features.has_ball()
            # Act:
            debug_counter += 1
            action_idx = agent.exploit_actions(curr_state_id)
            hfo_action_params, num_rep = \
                actions.map_action_idx_to_hfo_action(
                    agent_pos=features.get_pos_tuple(),
                    has_ball=has_ball,
                    action_idx=action_idx)
            # Step:
            status, observation = execute_action(
                action_params=hfo_action_params, repetitions=num_rep,
                has_ball=has_ball, game_interface=game_interface)
            # Update features:
            reward = reward_funct(status)
            features.update_features(observation)
            sum_score += reward
        # Sanity check: the episode should not time out after fewer than
        # five chosen actions.
        if status == OUT_OF_TIME:
            if debug_counter < 5:
                raise NoActionPlayedError(
                    "agent was only able to choose {}".format(debug_counter))
        # Game Reset
        game_interface.reset()
    print("<<TEST>> AVR reward = ", sum_score / (ep + 1))
    return sum_score / num_episodes
def train(num_train_episodes: int, num_total_train_ep: int,
          game_interface: HFOAttackingPlayer, features: DiscreteFeaturesV2,
          agent: QLearningAgentV6, actions: DiscreteActionsV5,
          reward_funct):
    """
    @param num_train_episodes: number of episodes to train in this iteration
    @param num_total_train_ep: total number of training episodes
    @param game_interface: game interface that manages the interaction
        between the agent and the HFO game;
    @param features: features interface that extracts the agent's features
        from the observation array;
    @param agent: learning agent;
    @param actions: actions interface;
    @param reward_funct: reward function used
    @return: (QLearningAgentV6) the trained agent
    """
    sum_score = 0
    sum_epsilons = 0
    agent.counter_explorations = 0
    agent.counter_exploitations = 0
    for ep in range(num_train_episodes):
        # Check if the server is still up:
        # if game_interface.hfo.step() == SERVER_DOWN:
        #     raise ServerDownError("training; episode={}".format(ep))
        # Go to origin position:
        features.update_features(game_interface.get_state())
        go_to_origin_position(game_interface=game_interface,
                              features=features, actions=actions)
        # Start learning loop
        debug_counter = 0  # TODO remove
        while game_interface.in_game():
            # Update environment features:
            curr_state_id = features.get_state_index()
            has_ball = features.has_ball()
            # Act:
            debug_counter += 1
            action_idx = agent.act(curr_state_id)
            hfo_action_params, num_rep = \
                actions.map_action_idx_to_hfo_action(
                    agent_pos=features.get_pos_tuple(),
                    has_ball=has_ball,
                    action_idx=action_idx)
            # Step:
            status, observation = execute_action(
                action_params=hfo_action_params, repetitions=num_rep,
                has_ball=has_ball, game_interface=game_interface)
            # Update environment features:
            reward = reward_funct(status)
            sum_score += reward
            features.update_features(observation)
            new_state_id = features.get_state_index()
            agent.store_ep(state_idx=curr_state_id, action_idx=action_idx,
                           reward=reward, next_state_idx=new_state_id,
                           has_ball=has_ball,
                           done=not game_interface.in_game())
        # Sanity check: the episode should not time out after fewer than
        # five chosen actions.
        if status == OUT_OF_TIME:
            if debug_counter < 5:
                raise NoActionPlayedError(
                    "agent was only able to choose {}".format(debug_counter))
        agent.learn_buffer()
        agent.update_hyper_parameters(num_total_episodes=num_total_train_ep)
        sum_epsilons += agent.epsilon
        # Game Reset
        game_interface.reset()
    print("<<TRAIN>> AVR reward = ", sum_score / num_train_episodes)
    print("<<TRAIN>> %Explorations={}%".format(
        round(agent.counter_explorations /
              (agent.counter_exploitations + agent.counter_explorations),
              4) * 100))
    return agent
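# store_ep() collects (state, action, reward, next_state, done) transitions
# and learn_buffer() consumes them; the agents' internals are not part of this
# file. Below is a minimal sketch of the standard one-step tabular Q-learning
# update that such a buffer replay could apply. The names q_table, buffer,
# learning_rate and discount_factor are assumptions made for illustration,
# not the actual QLearningAgentV6 API.
import numpy as np


def q_learning_update_sketch(q_table: np.ndarray, buffer: list,
                             learning_rate: float = 0.1,
                             discount_factor: float = 0.99) -> np.ndarray:
    """Apply Q(s,a) += lr * (r + gamma * max_a' Q(s',a') - Q(s,a)) to every
    stored transition; terminal transitions use the reward alone as target."""
    for state, action, reward, next_state, done in buffer:
        target = reward if done else (
            reward + discount_factor * np.max(q_table[next_state]))
        q_table[state, action] += learning_rate * (
            target - q_table[state, action])
    return q_table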