def get_action_costs():
  # Create a new environment, read the config and record the action costs
  env = make_environment('halite')
  action_costs = np.zeros(len(ACTION_MAPPING))
  for k in ACTION_MAPPING:
    if ACTION_MAPPING[k] == CONVERT:
      action_costs[k] = env.configuration.convertCost
    elif ACTION_MAPPING[k] == SPAWN:
      action_costs[k] = env.configuration.spawnCost
    else:
      action_costs[k] = 0

  return action_costs
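# Illustrative sketch, not part of the original module: one way the cost
# vector returned by get_action_costs() can be combined with a per-player
# halite budget to mask out actions the player cannot afford. This mirrors
# the budget check in the video-recording agent further below; the helper
# name and its arguments are assumptions made for this example only.
def _example_affordable_action(q_sub_values, action_budget):
  action_costs = get_action_costs()
  # Only keep actions whose cost fits within the remaining budget.
  affordable = np.where(action_costs <= action_budget)[0]
  # Greedy pick among the affordable actions.
  best_id = affordable[np.argmax(q_sub_values[affordable])]
  return best_id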
def get_input_output_shapes(config):
  # Create a new environment, perform the preprocessing and record the shape
  env = make_environment('halite')
  env.reset(num_agents=config['num_agents_per_game'])
  env_configuration = env.configuration
  env_observation = env.state[0].observation
  obs_input = state_to_input(
      structured_env_obs(env_configuration, env_observation, active_id=0),
      num_mirror_dim=config['num_mirror_dim'])
  num_actions = len(ACTION_MAPPING)

  return obs_input.shape, num_actions, config['num_q_functions']
def record_videos(agent_path, num_agents_per_game, rng_action_seeds,
                  extension_override=None, config_override_agents=None,
                  deterministic_games=False, env_seed_deterministic=0,
                  deterministic_extension=None, first_game_recording=None):
  print("Generating videos of iteration {}".format(agent_path))

  env_configuration = {"agentExec": "LOCAL"}
  if deterministic_games:
    env_configuration["randomSeed"] = env_seed_deterministic
  env = make_environment(
      "halite", configuration=env_configuration
  )  # , configuration={"agentTimeout": 10000, "actTimeout": 10000})
  config = load_configs([agent_path])[0]
  env_configuration = env.configuration

  def my_agent(observation, config_id):
    config = AGENT_CONFIGS[config_id]
    rng_action_seed = rng_action_seeds[config_id]
    active_id = observation.player
    current_observation = utils.structured_env_obs(
        env_configuration, observation, active_id)
    player_obs = observation.players[active_id]

    mapped_actions, _, _ = get_config_or_callable_actions(
        config, current_observation, player_obs, observation,
        env_configuration, rng_action_seed)

    return mapped_actions

  if config_override_agents is None:
    AGENT_CONFIGS = []
    for i in range(num_agents_per_game):
      AGENT_CONFIGS.append(sample_from_config(config))
  else:
    AGENT_CONFIGS = [
        sample_from_config_or_path(p, return_callable=True)
        for p in config_override_agents
    ]

  # For some reason this needs to be verbose - list comprehension breaks the
  # stochasticity of the agents.
  config_id_agents = [
      lambda observation: my_agent(observation, 0),
      lambda observation: my_agent(observation, 1),
      lambda observation: my_agent(observation, 2),
      lambda observation: my_agent(observation, 3),
  ][:num_agents_per_game]

  for video_type in ["self play", "random opponent"][:1]:
    original_video_type = video_type
    if config_override_agents is not None and video_type == "self play":
      video_type = "; ".join([
          a.rsplit('/', 1)[-1][:-3] for a in config_override_agents[1:]
      ])
    if original_video_type == "self play" and first_game_recording is not None:
      game_recording = first_game_recording
    else:
      env.reset(num_agents=num_agents_per_game)
      agents = config_id_agents if original_video_type == "self play" else [
          config_id_agents[0]
      ] + ["random"] * (num_agents_per_game - 1)
      env.run(agents)
      game_recording = env.render(mode="html", width=800, height=600)

    # Save the HTML recording in the videos folder
    folder, extension = tuple(agent_path.rsplit('/', 1))
    videos_folder = os.path.join(folder, 'Videos')
    Path(videos_folder).mkdir(parents=True, exist_ok=True)
    ext = extension[:-5] if extension_override is None else extension_override
    ext += deterministic_extension if deterministic_games else ''
    video_path = os.path.join(
        videos_folder, ext + ' - ' + video_type + '.html')
    with open(video_path, "w") as f:
      f.write(game_recording)
for i in range(num_agents):
  agent_path = agent_full_paths[i]
  agent_paths = [agent_path, agent_path, agent_path, agent_path]
  video_name = agent_extensions[i][:-3] + " ***self play***"
  agents_paths_video_names.append((agent_paths, video_name))

if agents_paths_video_names:
  # Load all agent callables once (not really that much more performant)
  agent_callables = {}
  for i in range(num_agents):
    agent_path = agent_full_paths[i]
    agent_file = environment_utils.read_file(agent_path)
    agent_callables[agent_path] = environment_utils.get_last_callable(
        agent_file)

  env = make_environment("halite", configuration={"agentExec": "LOCAL"})
  for agent_paths, video_name in agents_paths_video_names:
    agents = []
    for p in agent_paths:
      agents.append(agent_callables[p])
    env.reset(num_agents=len(agents))
    env.run(agents)

    # Save the HTML recording in the videos folder
    game_recording = env.render(mode="html", width=800, height=600)
    videos_folder = os.path.join(agents_folder, '../Videos')
    Path(videos_folder).mkdir(parents=True, exist_ok=True)
    video_path = os.path.join(videos_folder, video_name + '.html')
    with open(video_path, "w") as f:
      f.write(game_recording)
def collect_experience_single_game(
    game_agent_paths, game_agents, num_agents, verbose, game_id,
    env_random_seed, act_random_seeds, record_game, episode_steps_override,
    early_episode_termination, rule_actions_id):
  episode_start_time = time.time()

  # Generate reproducible data for better debugging
  utils.set_seed(env_random_seed)

  game_agents = [
      a if isinstance(a, dict) else (kaggle_agent.get_last_callable(a))
      for a in game_agents
  ]
  config_game_agents = [
      a if isinstance(a, dict) else "text_agent" for a in game_agents
  ]

  # Add option to shuffle the location of the main agent - for now this serves
  # for testing the stateful history logic.
  first_rule_agent = game_agents.pop(0)
  game_agents.insert(rule_actions_id, first_rule_agent)

  env_config = {"randomSeed": env_random_seed}
  if episode_steps_override is not None:
    env_config["episodeSteps"] = episode_steps_override
  env = make_environment('halite', configuration=env_config)
  env.reset(num_agents=num_agents)
  max_episode_steps = env.configuration.episodeSteps
  if early_episode_termination is not None:
    max_episode_steps = min(max_episode_steps, early_episode_termination)
  halite_scores = np.full((max_episode_steps, num_agents), np.nan)
  action_delays = np.full((max_episode_steps - 1, num_agents), np.nan)
  first_get_actions_durations = np.full(max_episode_steps - 1, np.nan)
  first_box_in_durations = np.full(max_episode_steps - 1, np.nan)
  first_history_durations = np.full(max_episode_steps - 1, np.nan)
  first_ship_scores_durations = np.full(max_episode_steps - 1, np.nan)
  first_ship_plans_durations = np.full(max_episode_steps - 1, np.nan)
  first_ship_map_durations = np.full(max_episode_steps - 1, np.nan)
  halite_scores[0] = env.state[0].observation.players[0][0]
  total_halite_spent = np.zeros(num_agents).tolist()

  initial_obs = utils.structured_env_obs(
      env.configuration, env.state[0].observation, 0)
  initial_halite_setup = initial_obs['halite']
  initial_agents_setup = np.zeros_like(initial_halite_setup)
  for i, (_, _, ships, _) in enumerate(initial_obs['rewards_bases_ships']):
    initial_agents_setup = initial_agents_setup + (i + 1) * ships

  # Take actions until the game is terminated
  episode_step = 0
  num_lost_ships = np.zeros((max_episode_steps - 1, num_agents), dtype=int)
  first_agent_step_details = []
  first_agent_ship_counts = np.zeros(max_episode_steps - 1)
  ship_counts = np.full((max_episode_steps - 1, num_agents), np.nan)
  histories = [{} for i in range(num_agents)]
  while not env.done:
    env_observation = env.state[0].observation
    player_mapped_actions = []
    for active_id in range(num_agents):
      agent_status = env.state[active_id].status
      players = env.state[0].observation.players
      if agent_status == 'ACTIVE':
        current_observation = utils.structured_env_obs(
            env.configuration, env_observation, active_id)
        player_obs = players[active_id]
        env_observation.player = active_id
        step_start_time = time.time()
        mapped_actions, updated_history, halite_spent, step_details = (
            rule_utils.get_config_or_callable_actions(
                game_agents[active_id], current_observation, player_obs,
                env_observation, env.configuration, histories[active_id],
                act_random_seeds[active_id]))
        histories[active_id] = updated_history
        ship_counts[current_observation['step'], active_id] = len(
            player_obs[2])
        if active_id == rule_actions_id:
          first_agent_step_details.append(step_details)
          first_get_actions_durations[episode_step] = step_details[
              'get_actions_duration']
          first_box_in_durations[episode_step] = step_details[
              'box_in_duration']
          first_history_durations[episode_step] = step_details[
              'history_start_duration']
          first_ship_scores_durations[episode_step] = step_details[
              'ship_scores_duration']
          first_ship_plans_durations[episode_step] = step_details[
              'ship_plans_duration']
          first_ship_map_durations[episode_step] = step_details[
              'ship_map_duration']
          first_agent_ship_counts[current_observation['step']] = len(
              player_obs[2])
        step_delay = time.time() - step_start_time
        action_delays[episode_step, active_id] = step_delay
        total_halite_spent[active_id] += halite_spent
        if verbose:
          print("Player {} obs: {}".format(active_id, player_obs))
          print("Actions: {}\n".format(mapped_actions))
        player_mapped_actions.append(mapped_actions)
      else:
        player_mapped_actions.append({})

    env.step(player_mapped_actions)

    for i in range(num_agents):
      agent_status = env.state[i].status
      halite_score = -1 if agent_status in [
          'INVALID', 'DONE'
      ] else env.state[0].observation.players[i][0]
      halite_scores[episode_step + 1, i] = halite_score

    ordered_current_observation = utils.structured_env_obs(
        env.configuration, env_observation, 0)
    num_lost_ships[episode_step] = get_lost_ships_count(
        player_mapped_actions, players, env.state[0].observation.players,
        ordered_current_observation, verbose_id=rule_actions_id + 0.5)

    episode_step += 1
    if early_episode_termination is not None and (
        episode_step >= (early_episode_termination - 1)):
      break

  # Write the terminal halite scores
  halite_scores = update_terminal_halite_scores(
      num_agents, halite_scores, episode_step, max_episode_steps, env)

  # Evaluate why the game evolved as it did
  # import pdb; pdb.set_trace()
  action_override_counts = np.array([
      first_agent_step_details[i]['action_overrides']
      for i in range(len(first_agent_step_details))
  ])
  print("Action override counts:", action_override_counts.sum(0))
  print("Num lost ships:", num_lost_ships.sum(0))

  # Obtain the terminal rewards for all agents
  episode_rewards = get_episode_rewards(halite_scores)

  # Obtain the terminal number of ships and bases for all agents
  terminal_num_bases, terminal_num_ships = get_base_and_ship_counts(env)
  terminal_halite = halite_scores[-1].tolist()
  print("Terminal halite:", terminal_halite)

  # Generate the episode recording if requested
  if record_game:
    game_recording = env.render(mode="html", width=800, height=600)
  else:
    game_recording = None

  # Combine the different first player durations into a matrix for better
  # analysis
  all_first_durations = np.stack([
      action_delays[:, rule_actions_id],
      first_get_actions_durations,
      first_box_in_durations,
      first_history_durations,
      first_ship_scores_durations,
      first_ship_plans_durations,
      first_ship_map_durations,
  ], -1)

  # Store the game data
  this_game_data = ExperienceGame(
      game_id,
      config_game_agents,
      game_agent_paths,
      initial_halite_setup,
      initial_agents_setup,
      halite_scores,
      all_first_durations,
      action_delays,
      first_get_actions_durations,
      first_box_in_durations,
      first_history_durations,
      first_ship_scores_durations,
      first_ship_plans_durations,
      first_ship_map_durations,
      episode_step,
      episode_rewards,
      terminal_num_bases,
      terminal_num_ships,
      terminal_halite,
      total_halite_spent,
      None,  # Opponent names added outside of this function
      env_random_seed,
      act_random_seeds,
      # first_agent_step_details,
      game_recording,
      num_lost_ships,
  )

  episode_duration = time.time() - episode_start_time

  return (this_game_data, episode_duration)
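# Illustrative sketch, not part of the original module: the duration matrix
# assembled at the end of collect_experience_single_game stacks the first
# agent's total action delay with the six profiled sub-phase durations,
# column by column. This helper (its name, and the column labels taken from
# the stacking order above, are assumptions for this example) reports the
# mean time spent in each phase, ignoring steps that were never reached.
def _summarize_first_agent_durations(all_first_durations):
  column_names = [
      'total_delay', 'get_actions', 'box_in', 'history', 'ship_scores',
      'ship_plans', 'ship_map']
  phase_means = np.nanmean(all_first_durations, axis=0)
  return dict(zip(column_names, phase_means.tolist()))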
def record_videos(agent_path, num_agents_per_game, num_mirror_dim,
                  extension_override=None):
  print("Generating videos of iteration {}".format(agent_path))
  model = load_models([agent_path])[0]
  action_costs = get_action_costs()

  def my_agent(observation, env_configuration):
    active_id = observation.player
    current_observation = structured_env_obs(
        env_configuration, observation, active_id)
    player_obs = observation.players[active_id]

    # Preprocess the state so it can be fed in to the network
    obs_input = np.expand_dims(
        state_to_input(current_observation, num_mirror_dim=num_mirror_dim), 0)

    # Obtain the q values
    q_values = model(obs_input).numpy()[0]

    # Determine valid actions for each of the ships/shipyards
    all_key_q_valid = get_key_q_valid(
        q_values, player_obs, env_configuration,
        current_observation['rewards_bases_ships'])

    mapped_actions = {}
    action_budget = player_obs[0]
    for i, (k, q_sub_values, valid_sub_actions, r, c, _) in enumerate(
        all_key_q_valid):
      # Set actions we can't afford to invalid
      valid_sub_actions &= action_costs <= action_budget
      valid_sub_actions = np.where(valid_sub_actions)[0]
      best_q = q_sub_values[valid_sub_actions].max()
      best_a_id = np.where(q_sub_values[valid_sub_actions] == best_q)[0][0]
      action_id = valid_sub_actions[best_a_id]

      # Hard coded epsilon greedy exploration
      if np.random.uniform() < 0.05:
        action_id = np.random.choice(valid_sub_actions)

      action_budget -= action_costs[action_id]
      mapped_action = ACTION_MAPPING[action_id]
      if mapped_action == GO_NEAREST_BASE:
        mapped_action = get_direction_nearest_base(
            player_obs, r, c, env_configuration.size)
      if not mapped_action in [SHIP_NONE, BASE_NONE]:
        mapped_actions[k] = mapped_action

    return mapped_actions

  env = make_environment(
      "halite", configuration={"agentExec": "LOCAL"}
  )  # , configuration={"agentTimeout": 10000, "actTimeout": 10000})
  for video_type in ["random opponent", "self play"]:
    env.reset(num_agents=num_agents_per_game)
    agents = [my_agent] * num_agents_per_game if (
        video_type == "self play") else [my_agent] + ["random"] * (
            num_agents_per_game - 1)
    env.run(agents)

    # Save the HTML recording in the videos folder
    game_recording = env.render(mode="html", width=800, height=600)
    folder, extension = tuple(agent_path.rsplit('/', 1))
    videos_folder = os.path.join(folder, 'Videos')
    Path(videos_folder).mkdir(parents=True, exist_ok=True)
    ext = extension[:-3] if extension_override is None else extension_override
    video_path = os.path.join(
        videos_folder, ext + ' - ' + video_type + '.html')
    with open(video_path, "w") as f:
      f.write(game_recording)
def collect_experience_single_game(this_agent, other_agents, num_agents,
                                   agent_config, action_costs, verbose,
                                   game_id):
  episode_start_time = time.time()
  game_agents, this_agent_position, opponent_id = get_game_agents(
      this_agent, other_agents, num_agents)
  this_game_data = []
  env = make_environment('halite')
  env.reset(num_agents=num_agents)
  exploration_parameter, max_exploration_parameter = (
      get_exploration_parameter(agent_config))
  max_episode_steps = env.configuration.episodeSteps
  halite_scores = np.full((max_episode_steps, num_agents), np.nan)
  halite_scores[0] = env.state[0].observation.players[0][0]
  episode_step = 0

  # Take actions until the game is terminated
  while not env.done:
    env_observation = env.state[0].observation
    player_current_observations = []
    player_current_obs = []
    player_env_obs = []
    player_network_outputs = []
    player_actions = []
    player_mapped_actions = []
    player_valid_actions = []
    store_transition_ids = []
    for active_id in range(num_agents):
      agent_status = env.state[active_id].status
      if agent_status == 'ACTIVE':
        store_transition_ids.append(active_id)
        current_observation = utils.structured_env_obs(
            env.configuration, env_observation, active_id)
        player_obs = env.state[0].observation.players[active_id]
        (current_obs, network_outputs, actions, mapped_actions,
         valid_actions) = utils.get_agent_q_and_a(
             game_agents[active_id], current_observation, player_obs,
             env.configuration, agent_config['epsilon_greedy'],
             exploration_parameter, agent_config['num_mirror_dim'],
             action_costs, pick_first_on_tie=False)
        if verbose:
          print("Player {} obs: {}".format(active_id, player_obs))
          print("Actions: {}\n".format(mapped_actions))
        player_current_observations.append(current_observation)
        player_current_obs.append(current_obs[0][0])
        player_env_obs.append(player_obs)
        player_network_outputs.append(network_outputs)
        player_actions.append(actions)
        player_mapped_actions.append(mapped_actions)
        player_valid_actions.append(valid_actions)
      else:
        if agent_status != 'INVALID':
          raise ValueError("Unexpected agent state: {}".format(agent_status))
        player_mapped_actions.append({})

    if verbose:
      print("Step: {}; Max halite: {}".format(
          episode_step, current_observation['halite'].max()))

    env.step(player_mapped_actions)
    env_observation = env.state[0].observation

    # Store the state transition data
    for i, active_id in enumerate(store_transition_ids):
      next_observation = utils.structured_env_obs(
          env.configuration, env_observation, active_id)
      # next_halite = next_observation['rewards_bases_ships'][0][0]
      # next_obs = utils.state_to_input(next_observation)
      agent_status = env.state[active_id].status
      next_halite = env.state[0].observation.players[active_id][0]
      # if next_halite-halite_scores[episode_step, active_id] < -5000:
      #   import pdb; pdb.set_trace()

      # Overwrite selected actions to None if the environment did not execute
      # the requested action.
      player_obs = env.state[0].observation.players[active_id]
      player_actions[i] = set_ignored_actions_to_None(
          player_actions[i], player_mapped_actions[active_id],
          player_env_obs[i], player_obs, player_current_observations[i],
          next_observation)

      this_game_data.append(
          ExperienceStep(
              game_id,
              player_current_obs[i],
              player_actions[i],
              player_mapped_actions[active_id],
              player_valid_actions[i],
              player_network_outputs[i],
              # next_obs,  # Dropped out of memory concerns - useful for debugging
              active_id == this_agent_position,  # This agent move?
              active_id,
              episode_step,
              next_halite,
              next_halite - halite_scores[episode_step, active_id],
              np.nan,  # Number of episode steps, overwritten at the end of episode
              agent_status == 'INVALID',  # Last episode action
              np.nan,  # Reward, overwritten at the end of the episode
          ))

    for i in range(num_agents):
      agent_status = env.state[i].status
      halite_score = -1 if agent_status == 'INVALID' else env.state[
          0].observation.players[i][0]
      halite_scores[episode_step + 1, i] = halite_score

    episode_step += 1

  # Obtain the terminal rewards for all agents
  halite_scores = halite_scores[:episode_step]
  episode_rewards = get_episode_rewards(halite_scores)

  # Update statistics which can not be computed before the episode is over.
  for i in range(len(store_transition_ids)):
    this_game_data[-1 - i].last_episode_action = True  # Last episode action
  for i in range(len(this_game_data)):
    this_game_data[i].num_episode_steps = episode_step

  episode_duration = time.time() - episode_start_time

  return (this_game_data, episode_rewards, opponent_id, this_agent_position,
          episode_duration)