def __init__(self, **kwargs):  # batch_size=None
    # Unpack arguments from sacred
    args = kwargs.get('env_args', kwargs)
    if isinstance(args, dict):
        args = convert(args)

    # Define the agents
    self.n_agents = 2
    self.episode_limit = args.steps

    if getattr(args, 'state_type', None):
        print('args.state_type', args.state_type)
        self.state_type = args.state_type
    else:
        self.state_type = 'obs'

    # Define the internal state
    self.steps = 0
    r_matrix = [[1, 1], [1, 1]]
    self.payoff_values = [r_matrix for _ in range(self.episode_limit)]
    self.final_step_diff = [[1, 1], [1, 4]]
    self.branches = 4
    self.branch = 0
    self.state_num = self.branches * (self.episode_limit + 1)
    self.n_actions = len(self.payoff_values[0])
    self.good_branches = args.good_branches
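# All of the constructors in this file funnel their sacred config through a
# `convert` helper whose implementation is not shown in this section. A minimal
# stand-in consistent with the observed usage (dict in, attribute access and
# getattr-with-default out, plus the _replace/_asdict calls used in main()
# below) would be a namedtuple factory like this sketch:
from collections import namedtuple

def convert(dictionary):
    # Build an immutable, attribute-accessible record from the dict's keys.
    return namedtuple('GenericDict', dictionary.keys())(**dictionary)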
def __init__(self, batch_size=None, **kwargs):
    # Unpack arguments from sacred
    args = kwargs["env_args"]
    if isinstance(args, dict):
        args = convert(args)
    self.args = args

    if getattr(args, "seed", None) is not None:
        self.seed = args.seed
        self.rs = np.random.RandomState(self.seed)  # initialise numpy random state
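# Usage note (illustrative, not from the source): drawing all environment
# randomness from self.rs, rather than the global np.random, makes episodes
# reproducible for a given sacred seed, e.g.
#   spawn = self.rs.randint(0, grid_size, size=2)  # instead of np.random.randint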
def __init__(self, args):
    if isinstance(args, dict):
        args = convert(args)
    if isinstance(args.env_args, dict):
        args.env_args = convert(args.env_args)
    self.args = args  # Unpack arguments from sacred
    self.device = self.args.device
    self.bs = self.args.batch_size_run

    # Note: randomness is handled differently here than in the rest of the code
    # (a local generator instead of the shared numpy random state).
    self.rng = default_rng(seed=self.args.env_args.seed)

    self.extra_action = self.args.env_args.extra_action

    # Define the agents
    self.n_players = 2
    self.n_agents = 2
    self.size_A = 3 if self.extra_action else 2
    self.n_obs = self.size_A
    self.n_actions = self.size_A
    self.r_success = self.args.env_args.r_success
    self.r_failure = self.args.env_args.r_failure
    self.n_comm_steps = self.args.env_args.n_comm_steps
    self.obs_size = 2
    self.state_size = 5
    self.episode_limit = self.args.env_args.episode_limit
def __init__(self, batch_size=None, **kwargs):
    # Unpack arguments from sacred
    args = kwargs["env_args"]
    if isinstance(args, dict):
        args = convert(args)

    # Define the agents and actions
    self.n_agents = 2
    self.n_actions = 3
    self.episode_limit = 1
    self.payoff_matrix = np.array([[8, -12, -12],
                                   [-12, 0, 0],
                                   [-12, 0, 0]])
    self.state = np.ones(5)
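# A hedged sketch (not the class's actual step()) of how a one-shot matrix
# game like the one above typically resolves a joint action into one shared
# reward; the (reward, terminated, info) return convention mirrors the other
# envs in this repo.
def step(self, actions):
    # actions: one discrete action per agent, e.g. [0, 2]
    reward = self.payoff_matrix[actions[0], actions[1]]
    terminated = True  # episode_limit == 1, so every episode is a single step
    return reward, terminated, {}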
def __init__(self, batch_size=None, **kwargs):
    # Unpack arguments from sacred
    args = kwargs["env_args"]
    if isinstance(args, dict):
        args = convert(args)

    # Define the agents
    self.n_agents = 2
    self.episode_limit = args.steps

    # Define the internal state
    self.steps = 0
    r_matrix = [[1, 1], [1, 1]]
    self.payoff_values = [r_matrix for _ in range(self.episode_limit)]
    self.final_step_diff = [[1, 1], [1, 4]]
    self.branches = 4
    self.branch = 0
    self.n_actions = len(self.payoff_values[0])
    self.good_branches = args.good_branches
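# A hedged sketch of the payoff lookup this constructor implies; the exact
# branch semantics live in the class's step(), which is not shown here. The
# structure above suggests that intermediate steps pay payoff_values[t] == 1
# regardless of actions, while the last step of a rewarding branch pays
# final_step_diff instead, so only the joint action (1, 1) earns the bonus of 4.
def _payoff(self, t, a0, a1, on_good_branch):
    if t == self.episode_limit - 1 and on_good_branch:
        return self.final_step_diff[a0][a1]
    return self.payoff_values[t][a0][a1]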
def main(_config, _run):
    config = convert(_config)
    _id = _run._id

    # Logging stuff
    logger = logging.getLogger("Main")
    if config.mongo:
        logging.disable(logging.WARNING)
    configure_stats_logging(
        str(_id) + "_" + config.name,
        log_interval=config.log_interval,
        sacred_info=_run.info,
        use_tb=config.tb,
    )
    stats = get_stats()

    logger.critical("ID: {}".format(_id))

    # Update config with environment specific information
    env = gym.make(config.env)
    num_actions = env.action_space.n
    config = config._replace(num_actions=num_actions)
    state_shape = env.observation_space.shape
    config = config._replace(state_shape=state_shape)

    # Wrap env
    env = EnvWrapper(env, debug=True, args=config)

    # Log the config
    config_str = "Config:\n\n"
    for k, v in sorted(config._asdict().items()):
        config_str += "    {}: {}\n".format(k, v)
    logger.critical(config_str)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.critical("Device: {}".format(device.type))

    # Make agent and target agent
    agent = get_model(config.agent)(config)
    target_agent = get_model(config.agent)(config)
    target_agent.load_state_dict(agent.state_dict())
    agent.to(device)
    target_agent.to(device)

    # Pseudocount stuff
    count_model = None
    if config.count_rewards:
        if config.atari_count:
            count_model = AtariCount(config)
        elif config.rnd_net_count:
            # assert config.count_state_only_rewards
            count_model = RndNetworkDistill(config, device)
        elif config.dora_count:
            count_model = DoraCount(config, device)
        else:
            count_model = PseudoCount(config)

    # Make action selector
    action_selector = None
    if config.action_selector == "eps_greedy":
        action_selector = eps_greedy.EpsGreedy(config)
    elif config.action_selector == "optimistic_action":
        action_selector = optimistic_action.OptimisticAction(count_model, config)
    elif config.action_selector == "bsp":
        action_selector = bsp_action.BSPAction(config)
    else:
        raise Exception("{} is not an Action Selector!".format(config.action_selector))

    # Make replay buffer.
    # Check if the obs dtype of the environment is an int
    obs_dtype = getattr(env.wrapped_env, "obs_dtype", np.float32)
    obs_scaling = getattr(env.wrapped_env, "obs_scaling", 1)
    replay_buffer = ReplayBuffer(size=config.buffer_size,
                                 frame_history_len=config.past_frames_input,
                                 obs_dtype=obs_dtype,
                                 obs_scaling=obs_scaling,
                                 args=config)
    if config.dora_count:
        dora_buffer = ReplayBuffer(size=config.batch_size * 4,
                                   frame_history_len=config.past_frames_input,
                                   obs_dtype=obs_dtype,
                                   obs_scaling=obs_scaling,
                                   args=config)

    # Make trainer
    trainer = None
    if config.trainer == "DQN":
        trainer = DQNTrainer(agent=agent, target_agent=target_agent, args=config,
                             count_model=count_model, buffer=replay_buffer)
    else:
        raise Exception("{} is not a Trainer!".format(config.trainer))

    testing_buffer = ReplayBuffer(size=(config.past_frames_input + 1),
                                  frame_history_len=config.past_frames_input,
                                  args=config)

    # Testing stuff
    testing_env = EnvWrapper(env=gym.make(config.env), debug=True, args=config)
    if config.test_augmented:
        assert config.action_selector == "optimistic_action"

    # Player positions
    positions = set()
    action_positions = set()

    T = 0
    start_time = time.time()
    last_time = start_time

    # Lots of code duplication :(
    logging.critical("Filling buffer with {:,} random experiences.".format(
        config.buffer_burn_in))
    state = env.reset()
    assert config.buffer_burn_in == 0
    for t in range(config.buffer_burn_in):
        buffer_idx = replay_buffer.store_frame(state)
        stacked_states = replay_buffer.encode_recent_observation()
        tensor_state = torch.tensor(stacked_states, device=device).unsqueeze(0)
        action = np.random.randint(config.num_actions)
        next_state, reward, terminated, info = env.step(action)

        terminal_to_store = terminated
        if "Steps_Termination" in info and info["Steps_Termination"]:
            terminal_to_store = False

        intrinsic_reward = 0
        pseudo_count = 0
        if config.count_rewards:
            pseudo_count = count_model.visit(tensor_state, action)
            if getattr(count_model, "reward_directly", False):
                intrinsic_reward = pseudo_count
            else:
                count_bonus = config.count_beta / sqrt(pseudo_count)
                intrinsic_reward = count_bonus

        replay_buffer.store_effect(buffer_idx, action,
                                   reward - config.reward_baseline,
                                   intrinsic_reward, terminal_to_store,
                                   pseudo_count)
        state = next_state
        if terminated:
            state = env.reset()
        logger.warning("Random action burn in t: {:,}".format(t))

    state = env.reset()
    episode = 0
    episode_reward = 0
    intrinsic_episode_reward = 0
    episode_length = 0
    env_positive_reward = 0
    max_episode_reward = 0

    if config.bsp:
        bsp_k = np.random.randint(config.bsp_k)
        action_selector.update_k(bsp_k)

    logging.critical("Beginning training.")

    while T < config.t_max:
        # Store the current state
        buffer_idx = replay_buffer.store_frame(state)
        if config.dora_count:
            dora_idx = dora_buffer.store_frame(state)

        # Get the stacked input vector
        stacked_states = replay_buffer.encode_recent_observation()

        # Get output from agent
        with torch.no_grad():
            tensor_state = torch.tensor(stacked_states, device=device).unsqueeze(0)
            agent_output = agent(tensor_state)

        # Select action
        action, action_info = action_selector.select_actions(
            agent_output, T, info={"state": tensor_state})

        # Take an environment step
        next_state, reward, terminated, info = env.step(action)
        T += 1
        stats.update_t(T)
        episode_reward += reward
        episode_length += 1

        terminal_to_store = terminated
        if "Steps_Termination" in info and info["Steps_Termination"]:
            logger.warning("Terminating because of episode limit")
            terminal_to_store = False

        # Log if a positive reward was ever received from the environment
        # (i.e. the goal was found).
        if reward > 0.1:
            env_positive_reward = 1
        stats.update_stats("Positive_Reward", env_positive_reward)

        # Calculate count-based intrinsic motivation
        intrinsic_reward = 0
        pseudo_count = 0
        if config.count_rewards:
            pseudo_count = count_model.visit(tensor_state, action)
            if getattr(count_model, "reward_directly", False):
                # The count model gives us the intrinsic reward directly
                intrinsic_reward = pseudo_count[0]
            else:
                # The count model gives us the pseudo-count
                count_bonus = config.count_beta / sqrt(pseudo_count)
                intrinsic_reward = count_bonus
            intrinsic_episode_reward += intrinsic_reward

        # Render training
        if config.render_train_env:
            debug_info = {}
            debug_info.update(action_info)
            env.render(debug_info=debug_info)

        # Add what happened to the buffer
        replay_buffer.store_effect(buffer_idx, action,
                                   reward - config.reward_baseline,
                                   intrinsic_reward, terminal_to_store,
                                   pseudo_count)
        if config.dora_count:
            dora_buffer.store_effect(dora_idx, action,
                                     reward - config.reward_baseline,
                                     intrinsic_reward, terminal_to_store,
                                     pseudo_count)

        # Update state
        state = next_state

        # If terminated
        if terminated:
            # If we terminated due to the episode limit, we need to add the
            # current state in
            if "Steps_Termination" in info and info["Steps_Termination"]:
                buffer_idx = replay_buffer.store_frame(state)
                replay_buffer.store_effect(buffer_idx, 0, 0, 0, True, 0,
                                           dont_sample=True)
                if config.dora_count:
                    dora_idx = dora_buffer.store_frame(state)
                    dora_buffer.store_effect(dora_idx, 0, 0, 0, True, 0,
                                             dont_sample=True)

            logger.warning("T: {:,}, Episode Reward: {:.2f}".format(T, episode_reward))
            state = env.reset()

            max_episode_reward = max(max_episode_reward, episode_reward)
            stats.update_stats("Episode Reward", episode_reward)
            stats.update_stats("Max Episode Reward", max_episode_reward)
            stats.update_stats("Episode Length", episode_length)
            stats.update_stats("Intrin Eps Reward", intrinsic_episode_reward)
            episode_reward = 0
            episode_length = 0
            intrinsic_episode_reward = 0
            episode += 1
            stats.update_stats("Episode", episode)

            if config.bsp:
                bsp_k = np.random.randint(config.bsp_k)
                action_selector.update_k(bsp_k)

        # Train if possible
        for _ in range(config.training_iters):
            sampled_batch = None
            if T % config.update_freq != 0:
                # Only train every update_freq timesteps
                continue
            if replay_buffer.can_sample(config.batch_size):
                sampled_batch = replay_buffer.sample(config.batch_size,
                                                     nstep=config.n_step)
            if sampled_batch is not None:
                trainer.train(sampled_batch)
            if config.dora_count:
                if dora_buffer.can_sample(config.batch_size):
                    sampled_batch = dora_buffer.sample(config.batch_size,
                                                       nstep=config.n_step)
                if sampled_batch is not None:
                    count_model.train(sampled_batch)

        # Update target networks if necessary
        if T % config.target_update_interval == 0:
            trainer.update_target_agent()
            if config.dora_count:
                count_model.update_target_agent()

        # Logging
        if config.bsp:
            agent_output = agent_output[:, :, bsp_k]
        q_vals_numpy = agent_output.detach().cpu()[0].numpy()
        if num_actions < 20:
            for action_id in range(config.num_actions):
                stats.update_stats("Q-Value_{}".format(action_id),
                                   q_vals_numpy[action_id])
        else:
            stats.update_stats("Q-Value_Mean", np.mean(q_vals_numpy))

        player_pos = env.log_visitation()
        positions.add(player_pos)
        action_positions.add((player_pos, action))
        stats.update_stats("States Visited", len(positions))
        stats.update_stats("State_Actions Visited", len(action_positions))
        stats.update_stats("Player Position", player_pos)

        # Log all env stats returned
        for k, v in info.items():
            if k != "Steps_Termination":
                stats.update_stats(k, v)

        if config.save_count_gifs > 0 and T % config.save_count_gifs == 0:
            if count_model is not None:
                state_action_counts, count_nums = env.count_state_action_space(count_model)
                if state_action_counts is not None:
                    save_image(state_action_counts,
                               image_name="SA_Counts__{}_Size__{}_T".format(
                                   config.count_size, T),
                               direc_name="State_Action_Counts")
                    save_sa_count_vals(count_nums,
                                       name="SA_PCounts__{}_Size__{}_T".format(
                                           config.count_size, T),
                                       direc_name="Sa_Count_Estimates")
                actual_counts = env.state_counts()
                if actual_counts is not None:
                    save_actual_counts(actual_counts,
                                       name="Counts__{}_T".format(T),
                                       direc_name="Actual_Counts")
                q_val_img, q_vals = env.q_value_estimates(count_model, agent)
                if q_val_img is not None:
                    save_image(q_val_img,
                               image_name="Q_Vals__{}_Size__{}_T".format(
                                   config.count_size, T),
                               direc_name="Q_Value_Estimates")
                if q_vals is not None:
                    save_q_vals(q_vals,
                                name="Q_Vals__{}_Size__{}_T".format(
                                    config.count_size, T),
                                direc_name="Q_Value_Estimates")

        # Testing
        with torch.no_grad():
            if T % config.testing_interval == 0:
                prefixes = [""]
                if config.test_augmented:
                    prefixes += ["Aug_"]
                for prefix in prefixes:
                    total_test_reward = 0
                    total_test_length = 0
                    for _ in range(config.test_episodes):
                        test_episode_reward = 0
                        test_episode_length = 0
                        test_state = testing_env.reset()
                        test_env_terminated = False
                        while not test_env_terminated:
                            test_buffer_idx = testing_buffer.store_frame(test_state)
                            stacked_test_states = testing_buffer.encode_recent_observation()
                            test_tensor_state = torch.tensor(stacked_test_states,
                                                             device=device).unsqueeze(0)
                            testing_agent_output = agent(test_tensor_state)
                            if prefix == "Aug_" or config.bsp:
                                test_action, _ = action_selector.select_actions(
                                    testing_agent_output, T,
                                    info={"state": test_tensor_state},
                                    testing=True)
                            else:
                                test_action = get_test_action(testing_agent_output, config)
                            next_test_state, test_reward, test_env_terminated, _ = \
                                testing_env.step(test_action)
                            if config.render_test_env:
                                testing_env.render()
                            test_episode_length += 1
                            test_episode_reward += test_reward
                            testing_buffer.store_effect(test_buffer_idx, test_action,
                                                        test_reward, 0,
                                                        test_env_terminated, 0)
                            test_state = next_test_state
                        total_test_length += test_episode_length
                        total_test_reward += test_episode_reward

                    mean_test_reward = total_test_reward / config.test_episodes
                    mean_test_length = total_test_length / config.test_episodes
                    logger.error(
                        "{}Testing -- T: {:,}/{:,}, Test Reward: {:.2f}, Test Length: {:,}"
                        .format(prefix, T, config.t_max, mean_test_reward,
                                mean_test_length))
                    stats.update_stats("{}Test Reward".format(prefix),
                                       mean_test_reward, always_log=True)
                    stats.update_stats("{}Test Episode Length".format(prefix),
                                       mean_test_length, always_log=True)

                logger.error("Estimated time left: {}. Time passed: {}".format(
                    time_left(last_time, T - config.testing_interval, T, config.t_max),
                    time_str(time.time() - start_time)))
                last_time = time.time()

        if T % (config.log_interval * 4) == 0:
            stats.print_stats()

    logger.critical("Closing envs")
    env.close()
    testing_env.close()

    logger.critical("Finished training.")

    if client is not None:  # `client` is assumed to be a module-level pymongo client
        logger.critical("Attempting to close pymongo client")
        client.close()
        logger.critical("Pymongo client closed")

    logger.critical("Exiting")
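# A minimal sketch of the action-selector interface that main() assumes above.
# The module/class names (eps_greedy.EpsGreedy) come from the source; the
# annealing schedule, the 0.05 floor, and the 50000-step horizon here are
# illustrative assumptions, not the repo's actual values.
import numpy as np
import torch

class EpsGreedy:
    def __init__(self, args):
        self.args = args

    def select_actions(self, agent_output, t, info=None, testing=False):
        # Linearly anneal epsilon from 1.0 down to a floor; act greedily in tests.
        eps = 0.0 if testing else max(0.05, 1.0 - t / 50000)
        if np.random.random() < eps:
            action = np.random.randint(self.args.num_actions)
        else:
            action = int(torch.argmax(agent_output, dim=-1).item())
        return action, {}  # matches the (action, action_info) convention in main()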
def __init__(self, **kwargs):
    self.debug_launcher = False
    self.port_in_use = False
    self.debug_inputs = False
    self.debug_rewards = False

    if self.debug_launcher:
        print("INIT")

    args = kwargs["env_args"]
    if isinstance(args, dict):
        args = convert(args)

    self._add_deepcopy_support()

    # Read arguments
    self.map_name = args.map_name
    assert map_present(self.map_name), \
        "map {} not in map registry! please add.".format(self.map_name)
    map_params = convert(get_map_params(self.map_name))
    self.map_type = map_params.map_type
    self.n_agents = map_params.n_agents
    self.n_enemies = map_params.n_enemies
    self._agent_race = map_params.agent_race
    self._bot_race = map_params.bot_race
    self.zealot_id = 65
    self.dragoon_id = 66
    self.episode_limit = map_params.limit
    self.micro_battles = map_params.micro_battles
    self._move_amount = args.move_amount
    self._step_mul = args.step_mul
    self.state_last_action = args.state_last_action

    # Rewards args
    self.reward_only_positive = args.reward_only_positive
    self.reward_negative_scale = args.reward_negative_scale
    self.reward_death_value = args.reward_death_value
    self.reward_win = args.reward_win
    self.reward_scale = args.reward_scale
    self.reward_scale_rate = args.reward_scale_rate

    # Other
    self.seed = args.seed
    self.heuristic = args.heuristic
    self.measure_fps = args.measure_fps
    self.continuing_episode = args.continuing_episode
    self.hostname = args.hostname
    self.port = portpicker.pick_unused_port()

    self.n_actions_no_attack = 6
    self.n_actions = self.n_actions_no_attack + self.n_enemies
    self.max_reward = self.n_enemies * self.reward_death_value + self.reward_win

    for tc_dir in ["/install/torchcraft"]:
        if os.path.isdir(tc_dir):
            os.environ['TCPATH'] = tc_dir

    if sys.platform == 'linux':
        os.environ['SC1PATH'] = os.path.join(os.getcwd(), '3rdparty',
                                             'StarCraftI', 'linux')
        self.env_file_type = 'so'
    elif sys.platform == 'darwin':
        os.environ['SC1PATH'] = os.path.join(os.getcwd(), '3rdparty',
                                             'StarCraftI', 'mac')
        self.env_file_type = 'dylib'

    # Check if a server has already been launched on this port
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        s.bind((socket.gethostbyname(socket.gethostname()), self.port))
        self.port_in_use = False
    except socket.error as e:
        if e.errno == errno.EADDRINUSE:
            # Port is already in use
            self.port_in_use = True
            print("Exception error: Port {} already in use.\n{}".format(self.port, e))
        else:
            # Something else raised the socket.error exception
            print(e)
    s.close()

    # For single-batch testing in BWAPILauncher rendering:
    # self.port = 11111  # Needs to be commented out when using more than one SC1 instance

    if self.debug_launcher:
        print("BEFORE LAUNCH SERVER")

    # Launch the server
    if not self.port_in_use:
        self._launch_server()

    if self.debug_launcher:
        print("BEFORE LAUNCH CLIENT")

    # Launch the game
    self._launch_client()

    if self.debug_launcher:
        print("AFTER LAUNCH CLIENT")

    self.map_x = self._obs.map_size[0]
    self.map_y = self._obs.map_size[1]
    self.map_play_area_min = [int(0), int(0)]
    self.map_play_area_max = [self.map_x, self.map_y]
    self.max_distance_x = self.map_x
    self.max_distance_y = self.map_y

    self._episode_count = -1
    self._total_steps = 0
    self.battles_won = 0
    self.battles_game = 0
    self.timeouts = 0
    self.force_restarts = 0
def __init__(self, **kwargs):
    args = kwargs["env_args"]
    if isinstance(args, dict):
        args = convert(args)

    # Read arguments
    self.map_name = args.map_name
    assert map_present(self.map_name), \
        "map {} not in map registry! please add.".format(self.map_name)
    map_params = convert(get_map_params(self.map_name))
    self.n_agents = map_params.n_agents
    self.n_enemies = map_params.n_enemies
    self.episode_limit = map_params.limit
    self._move_amount = args.move_amount
    self._step_mul = args.step_mul
    self.difficulty = args.difficulty

    # Observations and state
    self.obs_own_health = args.obs_own_health
    self.obs_all_health = args.obs_all_health
    self.obs_instead_of_state = args.obs_instead_of_state
    self.state_last_action = args.state_last_action
    if self.obs_all_health:
        self.obs_own_health = True

    # Rewards args
    self.reward_sparse = args.reward_sparse
    self.reward_only_positive = args.reward_only_positive
    self.reward_negative_scale = args.reward_negative_scale
    self.reward_death_value = args.reward_death_value
    self.reward_win = args.reward_win
    self.reward_defeat = args.reward_defeat
    self.reward_scale = args.reward_scale
    self.reward_scale_rate = args.reward_scale_rate

    # Other
    self.continuing_episode = args.continuing_episode
    self.seed = args.seed
    self.heuristic = args.heuristic
    self.window_size = (1920, 1200)
    self.save_replay_prefix = args.save_replay_prefix

    # For sanity checks
    self.debug_inputs = False
    self.debug_rewards = False

    # Actions
    self.n_actions_no_attack = 6
    self.n_actions = self.n_actions_no_attack + self.n_enemies

    # Map info
    self._agent_race = map_params.a_race
    self._bot_race = map_params.b_race
    self.shield_bits_ally = 1 if self._agent_race == "P" else 0
    self.shield_bits_enemy = 1 if self._bot_race == "P" else 0
    self.unit_type_bits = map_params.unit_type_bits
    self.map_type = map_params.map_type

    if sys.platform == 'linux':
        os.environ.setdefault('SC2PATH',
                              os.path.join(os.getcwd(), "3rdparty", 'StarCraftII'))
        self.game_version = args.game_version
    else:
        # Can be derived automatically
        self.game_version = None

    # Launch the game
    self._launch()

    self.max_reward = self.n_enemies * self.reward_death_value + self.reward_win
    self._game_info = self.controller.game_info()
    self._map_info = self._game_info.start_raw
    self.map_x = self._map_info.map_size.x
    self.map_y = self._map_info.map_size.y
    self.map_play_area_min = self._map_info.playable_area.p0
    self.map_play_area_max = self._map_info.playable_area.p1
    self.max_distance_x = self.map_play_area_max.x - self.map_play_area_min.x
    self.max_distance_y = self.map_play_area_max.y - self.map_play_area_min.y
    self.terrain_height = np.array(
        list(self._map_info.terrain_height.data)).reshape(self.map_x, self.map_y)
    self.pathing_grid = np.array(
        list(self._map_info.pathing_grid.data)).reshape(self.map_x, self.map_y)

    self._episode_count = 0
    self._total_steps = 0
    self.battles_won = 0
    self.battles_game = 0
    self.timeouts = 0
    self.force_restarts = 0
    self.last_stats = None
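# Worked example for the reward bound above (values illustrative): with
# n_enemies = 8, reward_death_value = 10 and reward_win = 200,
#   max_reward = 8 * 10 + 200 = 280.
# SMAC-style environments then typically rescale each step's reward by
# reward_scale_rate / max_reward when reward_scale is set, keeping returns
# in a comparable range across maps.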
def __init__(self, batch_size=None, **kwargs):
    # Unpack arguments from sacred
    args = kwargs["env_args"]
    if isinstance(args, dict):
        args = convert(args)
    self.args = args
    self.print_caught_prey = getattr(args, "print_caught_prey", False)
    self.print_frozen_agents = getattr(args, "print_frozen_agents", False)

    # Add-on for the graph interface
    self.state_as_graph = args.state_as_graph
    if self.state_as_graph:
        self.absolute_distance = getattr(args, "absolute_distance", False)
        self.normalise_distance = getattr(args, "normalise_distance", False)
        self.add_walls = getattr(args, "add_walls", False)
        self.prey_relational = getattr(args, "prey_relational", True)

    # Add-on for goat hunts (goats like to climb mountains)
    self.mountain_slope = getattr(args, "mountain_slope", 0.0)
    self.capture_conditions = getattr(args, "capture_conditions", [0, 1])
    self.mountain_spawn = getattr(args, "mountain_spawn", False)
    self.mountain_agent_row = getattr(args, "mountain_agent_row", -1)

    # Backwards compatibility of batch_mode
    self.batch_mode = batch_size is not None
    self.batch_size = batch_size if self.batch_mode else 1

    # Define the environment grid
    self.truncate_episodes = getattr(args, "truncate_episodes", True)
    self.observe_ids = getattr(args, "observe_ids", False)
    self.intersection_global_view = getattr(args, "intersection_global_view", False)
    self.intersection_unknown = getattr(args, "intersection_unknown", False)
    self.directed_observations = getattr(args, "directed_observations", False)
    self.directed_cone_narrow = getattr(args, "directed_cone_narrow", True)
    self.directed_exta_actions = getattr(args, "directed_exta_actions", True)
    self.random_ghosts = getattr(args, "random_ghosts", False)
    self.random_ghosts_prob = getattr(args, "random_ghosts_prob", 0.5)
    self.random_ghosts_mul = getattr(args, "random_ghosts_mul", -1.0)
    self.random_ghosts_random_indicator = getattr(args, "random_ghosts_indicator", False)
    self.observe_state = getattr(args, "observe_state", False)
    self.observe_walls = getattr(args, "observe_walls", True)
    self.observe_one_hot = getattr(args, "observe_one_hot", False)
    self.n_feats = (5 if self.observe_one_hot else 3) + (1 if self.random_ghosts else 0)
    self.toroidal = args.toroidal
    shape = args.world_shape
    self.x_max, self.y_max = shape
    self.state_size = self.x_max * self.y_max * self.n_feats
    self.env_max = np.asarray(shape, dtype=int_type)
    self.grid_shape = np.asarray(shape, dtype=int_type)
    # Feature planes: 0=agents, 1=stag, 2=hare, [3=wall, 4=unknown], [-1=ghost-indicator]
    self.grid = np.zeros((self.batch_size, self.x_max, self.y_max, self.n_feats),
                         dtype=float_type)

    # If True, prey can randomly turn into ghosts (negative reward),
    # indicated by a feature in one corner of the grid
    if self.random_ghosts:
        self.ghost_indicator = False  # whether the prey currently is a ghost
        self.ghost_indicator_potential_positions = np.asarray(
            [[0, 0], [0, self.x_max - 1], [self.y_max - 1, 0],
             [self.y_max - 1, self.x_max - 1]], dtype=int_type)
        # Position of the indicator whether prey is a ghost (-1) or not (+1)
        self.ghost_indicator_pos = [0, 0]

    # Define the agents and their action space
    self.capture_action = getattr(args, "capture_action", False)
    self.capture_action_conditions = getattr(args, "capture_action_conditions", (2, 1))
    self.actions = np.asarray([[0, 1], [1, 0], [0, -1], [-1, 0], [0, 0],
                               [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]],
                              dtype=int_type)
    self.action_names = ["right", "down", "left", "up", "stay", "catch",
                         'look-right', 'look-down', 'look-left', 'look-up']
    self.agent_move_block = np.asarray(getattr(args, "agent_move_block", [0]),
                                       dtype=int_type)
    self.n_actions = 10 if self.directed_observations and self.directed_exta_actions \
        else (6 if self.capture_action else 5)
    self.n_agents = args.n_agents
    self.n_stags = args.n_stags
    self.p_stags_rest = args.p_stags_rest
    self.n_hare = args.n_hare
    self.p_hare_rest = args.p_hare_rest
    self.n_prey = self.n_stags + self.n_hare
    self.agent_obs = args.agent_obs
    self.agent_obs_dim = np.asarray(self.agent_obs, dtype=int_type)
    if self.observe_state:
        # The size of the global state as observation (with one additional position feature)
        self.obs_size = int(self.state_size + self.grid_shape[0] * self.grid_shape[1])
    elif self.directed_observations and self.directed_cone_narrow:
        # The size of the visible observation cone for this option
        self.obs_size = self.n_feats * (2 * args.agent_obs[0] - 1) * (2 * args.agent_obs[1] - 1)
    else:
        # The agent-centric observation size
        self.obs_size = self.n_feats * (2 * args.agent_obs[0] + 1) * (2 * args.agent_obs[1] + 1)

    # Define the episode and rewards
    self.episode_limit = args.episode_limit
    self.time_reward = getattr(args, "reward_time", -0.1)
    self.collision_reward = getattr(args, "reward_collision", 0.0)
    self.capture_hare_reward = getattr(args, "reward_hare", 1.0)
    self.capture_stag_reward = getattr(args, "reward_stag", 2.0)
    self.miscapture_punishment = float(
        getattr(args, "miscapture_punishment", -self.capture_stag_reward))
    self.capture_terminal = getattr(args, "capture_terminal", True)
    self.capture_freezes = getattr(args, "capture_freezes", True)
    self.remove_frozen = getattr(args, "remove_frozen", False)

    # Define the internal state
    self.agents = np.zeros((self.n_agents, self.batch_size, 2), dtype=int_type)
    self.agents_not_frozen = np.ones((self.n_agents, self.batch_size), dtype=int_type)
    self.agents_orientation = np.zeros((self.n_agents, self.batch_size),
                                       dtype=int_type)  # uses action labels 0..3
    self.prey = np.zeros((self.n_prey, self.batch_size, 2), dtype=int_type)
    self.prey_alive = np.zeros((self.n_prey, self.batch_size), dtype=int_type)
    self.prey_type = np.ones((self.n_prey, self.batch_size),
                             dtype=int_type)  # fill with stag (1)
    self.prey_type[self.n_stags:, :] = 2  # set hares to 2
    self.steps = 0
    self.sum_rewards = 0
    self.reset()

    self.made_screen = False
    self.scaling = 5
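# Worked example for the observation sizes above, assuming agent_obs = (2, 2)
# and n_feats = 3 (no one-hot walls, no ghost indicator):
#   agent-centric view:  3 * (2*2 + 1) * (2*2 + 1) = 75
#   narrow cone view:    3 * (2*2 - 1) * (2*2 - 1) = 27
# With observe_state on a 10x10 grid instead: state_size 300 + 100 position
# features = obs_size 400.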
env_args = {
    # ... earlier entries (e.g. world_shape, toroidal) truncated in this snippet ...
    'reward_hare': 1,
    'reward_stag': 10,
    'reward_collision': 0.0,
    'reward_time': -0.1,
    'capture_terminal': True,
    'episode_limit': 200,
    'n_stags': 2,
    'p_stags_rest': 0.1,
    'n_hare': 4,
    'p_hare_rest': 0.5,
    'n_agents': 4,
    'agent_obs': (2, 2),
    'state_as_graph': False,
    'print_caught_prey': True,
}
env_args = convert(env_args)
print(env_args)

env = StagHunt(env_args=env_args)
[all_obs, state] = env.reset()
print("Env is ", "batched" if env.batch_mode else "not batched")

# Debug toggles for inspecting the state and observations
if False:
    grid = state.reshape((6, 6, 3))
    for i in range(grid.shape[2]):
        print(grid[:, :, i], '\n')
if False:
    print(state)
    for i in range(env.n_agents):
        print(all_obs[i])
def __init__(self, **kwargs):
    args = kwargs["env_args"]
    if isinstance(args, dict):
        args = convert(args)
    self.flounderl_delegate_if_zero_ck = getattr(kwargs["args"],
                                                 "flounderl_delegate_if_zero_ck", False)
    self.map_param_registry = kwargs.get("map_param_registry", map_param_registry)

    # Read arguments
    self.map_name = args.map_name
    assert self.map_name in map_param_registry, \
        "map {} not in map registry! please add.".format(self.map_name)
    self.n_agents = map_param_registry[self.map_name]["n_agents"]
    self.n_enemies = map_param_registry[self.map_name]["n_enemies"]
    self.episode_limit = map_param_registry[self.map_name]["limit"]
    self._move_amount = args.move_amount
    self._step_mul = args.step_mul
    self.difficulty = args.difficulty
    self.state_last_action = args.state_last_action

    # Rewards args
    self.reward_only_positive = args.reward_only_positive
    self.reward_negative_scale = args.reward_negative_scale
    self.reward_death_value = args.reward_death_value
    self.reward_damage_coef = args.reward_damage_coef
    self.reward_win = args.reward_win
    self.reward_scale = args.reward_scale
    self.reward_scale_rate = args.reward_scale_rate

    # Other
    self.seed = args.seed
    self.heuristic = args.heuristic
    self.measure_fps = args.measure_fps
    self.obs_ignore_ally = getattr(args, "obs_ignore_ally", False)
    self.obs_instead_of_state = getattr(args, "obs_instead_of_state", False)
    self.window_size = (1920, 1200)

    self.debug_inputs = False
    self.debug_rewards = False
    self.debug_action_result = False

    self.n_actions_no_attack = 6
    self.n_actions = self.n_actions_no_attack + self.n_enemies

    self.fully_observable = getattr(args, "fully_observable", False)
    self.relax_pairwise_aa = getattr(args, "relax_pairwise_aa", False)
    self.continuing_episode = args.continuing_episode

    self.map_settings()

    if sys.platform == 'linux':
        self.game_version = getattr(args, "game_version", "3.16.1")
        versioned_path = os.path.join(os.getcwd(), "3rdparty",
                                      'StarCraftII__{}'.format(self.game_version))
        if os.path.exists(versioned_path):
            os.environ['SC2PATH'] = versioned_path
        else:
            os.environ['SC2PATH'] = os.path.join(os.getcwd(), "3rdparty", 'StarCraftII')
    else:
        self.game_version = "4.3.2"

    # Launch the game
    self._launch()

    self.max_reward = self.n_enemies * self.reward_death_value + self.reward_win
    self._game_info = self.controller.game_info()
    self.map_x = self._game_info.start_raw.map_size.x
    self.map_y = self._game_info.start_raw.map_size.y
    self.map_play_area_min = self._game_info.start_raw.playable_area.p0
    self.map_play_area_max = self._game_info.start_raw.playable_area.p1
    self.max_distance_x = self.map_play_area_max.x - self.map_play_area_min.x
    self.max_distance_y = self.map_play_area_max.y - self.map_play_area_min.y

    self._episode_count = 0
    self._total_steps = 0
    self.battles_won = 0
    self.battles_game = 0
    self.timeouts = 0
    self.force_restarts = 0
def __init__(self, batch_size=None, **kwargs):
    # Fetch all parameters
    args = kwargs
    if isinstance(args, dict):
        args = convert(args)
    self.args = args
    self.print_caught_prey = getattr(args, "print_caught_prey", False)
    self.print_frozen_agents = getattr(args, "print_frozen_agents", False)

    # Add-on for the graph interface (add-on 1)
    self.state_as_graph = args.state_as_graph
    if self.state_as_graph:
        self.absolute_distance = getattr(args, "absolute_distance", False)
        self.normalise_distance = getattr(args, "normalise_distance", False)
        self.add_walls = getattr(args, "add_walls", False)
        self.prey_relational = getattr(args, "prey_relational", True)

    # Add-on for goat hunts (goats like to climb mountains; add-on 2)
    self.mountain_slope = getattr(args, "mountain_slope", 0.0)
    self.capture_conditions = getattr(args, "capture_conditions", [0, 1])
    self.mountain_spawn = getattr(args, "mountain_spawn", False)
    self.mountain_agent_row = getattr(args, "mountain_agent_row", -1)

    # Backwards compatibility of batch_mode
    self.batch_mode = batch_size is not None
    self.batch_size = batch_size if self.batch_mode else 1

    # Define the environment grid
    self.truncate_episodes = getattr(args, "truncate_episodes", True)
    self.observe_ids = getattr(args, "observe_ids", False)
    self.intersection_global_view = getattr(args, "intersection_global_view", False)
    self.intersection_unknown = getattr(args, "intersection_unknown", False)
    self.directed_observations = getattr(args, "directed_observations", False)
    self.directed_cone_narrow = getattr(args, "directed_cone_narrow", True)
    self.directed_exta_actions = getattr(args, "directed_exta_actions", True)
    self.random_ghosts = getattr(args, "random_ghosts", False)
    self.random_ghosts_prob = getattr(args, "random_ghosts_prob", 0.5)
    self.random_ghosts_mul = getattr(args, "random_ghosts_mul", -1.0)
    self.random_ghosts_random_indicator = getattr(args, "random_ghosts_indicator", False)
    self.observe_state = getattr(args, "observe_state", False)
    self.observe_walls = getattr(args, "observe_walls", True)
    self.observe_one_hot = getattr(args, "observe_one_hot", False)
    self.n_feats = (5 if self.observe_one_hot else 3) + (1 if self.random_ghosts else 0)
    self.toroidal = args.toroidal
    shape = args.world_shape
    self.x_max, self.y_max = shape
    self.state_size = self.x_max * self.y_max * self.n_feats  # e.g. state_size: 300
    self.env_max = np.asarray(shape, dtype=int_type)
    self.grid_shape = np.asarray(shape, dtype=int_type)
    # Feature planes: 0=agents, 1=stag, 2=hare, [3=wall, 4=unknown], [-1=ghost-indicator]
    self.grid = np.zeros((self.batch_size, self.x_max, self.y_max, self.n_feats),
                         dtype=float_type)

    # If True, prey can randomly turn into ghosts (negative reward),
    # indicated by a feature in one corner of the grid
    if self.random_ghosts:
        self.ghost_indicator = False  # whether the prey currently is a ghost
        self.ghost_indicator_potential_positions = np.asarray(
            [[0, 0], [0, self.x_max - 1], [self.y_max - 1, 0],
             [self.y_max - 1, self.x_max - 1]], dtype=int_type)
        # Position of the indicator whether prey is a ghost (-1) or not (+1)
        self.ghost_indicator_pos = [0, 0]

    # Define the agents and their action space
    self.capture_action = getattr(args, "capture_action", False)
    self.capture_action_conditions = getattr(args, "capture_action_conditions", (2, 1))
    self.actions = np.asarray([[0, 1], [1, 0], [0, -1], [-1, 0], [0, 0],
                               [0, 0], [0, 0], [0, 0], [0, 0], [0, 0]],
                              dtype=int_type)
    self.action_names = ["right", "down", "left", "up", "stay", "catch",
                         'look-right', 'look-down', 'look-left', 'look-up']
    self.agent_move_block = np.asarray(getattr(args, "agent_move_block", [0]),
                                       dtype=int_type)
    self.n_actions = 10 if self.directed_observations and self.directed_exta_actions \
        else (6 if self.capture_action else 5)
    self.n_agents = args.n_agents
    self.n_stags = args.n_stags
    self.p_stags_rest = args.p_stags_rest
    self.n_hare = args.n_hare
    self.p_hare_rest = args.p_hare_rest
    self.n_prey = self.n_stags + self.n_hare
    self.agent_obs = args.agent_obs
    self.agent_obs_dim = np.asarray(self.agent_obs, dtype=int_type)
    if self.observe_state:
        # The size of the global state as observation (with one additional position feature)
        self.obs_size = int(self.state_size + self.grid_shape[0] * self.grid_shape[1])
    elif self.directed_observations and self.directed_cone_narrow:
        # The size of the visible observation cone for this option
        self.obs_size = self.n_feats * (2 * args.agent_obs[0] - 1) * (2 * args.agent_obs[1] - 1)
    else:
        # The agent-centric observation size (observation dimension, e.g. 75)
        self.obs_size = self.n_feats * (2 * args.agent_obs[0] + 1) * (2 * args.agent_obs[1] + 1)

    # Define the episode and rewards
    self.episode_limit = args.episode_limit
    self.time_reward = getattr(args, "reward_time", -0.1)
    self.collision_reward = getattr(args, "reward_collision", 0.0)
    self.capture_hare_reward = getattr(args, "reward_hare", 1.0)
    self.capture_stag_reward = getattr(args, "reward_stag", 2.0)
    self.miscapture_punishment = float(
        getattr(args, "miscapture_punishment", -self.capture_stag_reward))
    self.capture_terminal = getattr(args, "capture_terminal", True)
    self.capture_freezes = getattr(args, "capture_freezes", True)
    self.remove_frozen = getattr(args, "remove_frozen", False)

    # Define the internal state; the agents are the hunters
    self.agents = np.zeros((self.n_agents, self.batch_size, 2), dtype=int_type)
    self.agents_not_frozen = np.ones((self.n_agents, self.batch_size), dtype=int_type)
    self.agents_orientation = np.zeros((self.n_agents, self.batch_size),
                                       dtype=int_type)  # uses action labels 0..3
    # The prey, comprising hares and stags
    self.prey = np.zeros((self.n_prey, self.batch_size, 2), dtype=int_type)
    self.prey_alive = np.zeros((self.n_prey, self.batch_size), dtype=int_type)
    self.prey_type = np.ones((self.n_prey, self.batch_size),
                             dtype=int_type)  # fill with stag (1)
    self.prey_type[self.n_stags:, :] = 2  # set hares to 2
    self.steps = 0
    self.sum_rewards = 0
    self.reset()

    self.made_screen = False
    self.scaling = 5
def run(_run, _config, _log, pymongo_client):
    # Check args sanity
    _config = args_sanity_check(_config, _log)

    # Convert the _config dict to GenericDict objects (which cannot be overwritten later)
    args = convert(_config)

    _log.info("Experiment Parameters:")
    experiment_params = pprint.pformat(_config, indent=4, width=1)
    _log.info("\n\n" + experiment_params + "\n")

    import os
    _log.info("OS ENVIRON KEYS: {}".format(os.environ))

    if _config.get("debug_mode", None) is not None:
        _log.warning("ATTENTION DEBUG MODE: {}".format(_config["debug_mode"]))

    # ----- configure logging

    # Configure the tensorboard logger
    unique_token = "{}__{}".format(
        args.name, datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
    if args.use_tensorboard:
        from tensorboard_logger import configure, log_value
        from os.path import dirname, abspath
        file_tb_path = os.path.join(dirname(dirname(abspath(__file__))), "tb_logs")
        configure(os.path.join(file_tb_path, unique_token))

    # Configure the trajectory logger: set up the logging object passed on from now on
    logging_struct = SN(py_logger=_log,
                        sacred_log_scalar_fn=partial(append_scalar, run=_run))
    if args.use_tensorboard:
        logging_struct.tensorboard_log_scalar_fn = log_value
    if getattr(args, "use_hdf_logger", False):
        logging_struct.hdf_logger = HDFLogger(path=args.local_results_path,
                                              name=args.name,
                                              logging_struct=logging_struct)

    # ----- execute runners

    # Run the framework in the selected run_mode
    if args.run_mode in ["parallel_subproc"]:
        run_parallel(args=args, _run=_run, _logging_struct=logging_struct,
                     unique_token=unique_token)
    else:
        run_sequential(args=args, _run=_run, _logging_struct=logging_struct,
                       unique_token=unique_token)

    # Clean up after finishing
    print("Exiting Main")

    if pymongo_client is not None:
        print("Attempting to close mongodb client")
        pymongo_client.close()
        print("Mongodb client closed")

    print("Stopping all threads")
    for t in threading.enumerate():
        if t.name != "MainThread":
            print("Thread {} is alive! Is daemon: {}".format(t.name, t.daemon))
            t.join(timeout=1)
            print("Thread joined")

    print("Exiting script")

    # Make sure the framework really exits
    os._exit(os.EX_OK)
def __init__(self, batch_size=None, **kwargs):
    # Unpack arguments from sacred
    args = kwargs["env_args"]
    if isinstance(args, dict):
        args = convert(args)

    # Primary config
    self.scenario = getattr(args, "scenario", "11_vs_11_stochastic")
    self.game_visibility = getattr(args, "game_visibility", "full")
    self.representation = getattr(args, "representation", "simple115")
    self.full_obs_flag = getattr(args, "full_obs", False)
    self.view_angle = getattr(args, "view_angle", 160)
    self.rewards = getattr(args, "rewards", "scoring")

    # Video dumping config
    self.write_full_episode_dumps = getattr(args, "write_full_episode_dumps", False)
    self.write_video = getattr(args, "write_video", False)
    self.dump_frequency = getattr(args, "dump_frequency", 1)
    self.logdir = getattr(args, "logdir", "episode_dumps")

    # Environment modifiers
    self.move_goalkeeper = getattr(args, "move_goalkeeper", False)
    self.difficulty_override = getattr(args, "env_difficulty", -1)

    # Secondary config
    scenario_config = {
        "11_vs_11_stochastic": {"n_agents": 11},
        "academy_empty_goal_close": {"n_agents": 1},
        "academy_empty_goal": {"n_agents": 1},
        "academy_run_to_score": {"n_agents": 1},
        "academy_run_to_score_with_keeper": {"n_agents": 1},
        "academy_pass_and_shoot_with_keeper": {"n_agents": 2},
        "academy_run_pass_and_shoot_with_keeper": {"n_agents": 2},
        "academy_3_vs_1_with_keeper": {"n_agents": 3},
        "academy_corner": {"n_agents": 1},
        "academy_counterattack_easy": {"n_agents": 4},
        "academy_single_goal_versus_lazy": {"n_agents": 11},
    }
    if getattr(args, "n_agents", -1) == -1:
        self.n_agents = scenario_config[self.scenario]["n_agents"]
    else:
        assert args.n_agents <= scenario_config[self.scenario]["n_agents"], \
            "Scenario only supports up to {} agents - you supplied {}!".format(
                scenario_config[self.scenario]["n_agents"], args.n_agents)
        self.n_agents = args.n_agents

    # TODO: Look up the correct episode length!
    self.episode_limit = args.episode_limit if getattr(args, "episode_limit", -1) != -1 else 1000
    self.observation_reference_frame = getattr(args, "observation_reference_frame", "fixed")

    # Either the default set (19 actions) or the non-sticky set (14 actions)
    self.action_set = getattr(args, "action_set", 'default')
    self.n_actions = 19 if self.action_set != 'non_sticky' else 14

    self.env = football_env.create_environment(
        env_name=self.scenario,
        render=False,
        number_of_left_players_agent_controls=self.n_agents,
        representation=self.representation,
        rewards=self.rewards,
        write_full_episode_dumps=self.write_full_episode_dumps,
        write_video=self.write_video,
        dump_frequency=self.dump_frequency,
        logdir=self.logdir,
        # po_view_cone_xy_opening=self.view_angle,
        # full_obs_flag=self.full_obs_flag,
        action_set=self.action_set,
    )
    self.reset()

    self.obs_size = self.observations[0].shape
    self.state = self.env.get_global_state()
    self.state_size = self.state.shape
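# A hedged usage sketch for this wrapper. The class name GFootballEnv is an
# assumption (only __init__ is shown above), and reset() is assumed to follow
# the [all_obs, state] convention used by the StagHunt demo earlier in this file.
# env = GFootballEnv(env_args={"scenario": "academy_3_vs_1_with_keeper",
#                              "episode_limit": 150})
# print(env.n_agents, env.n_actions)   # -> 3 19 with the default action set
# print(env.obs_size, env.state_size)  # shapes captured at the end of __init__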