        actions[ag] = t.argmax(agents[ag].final_step())
    if is_warm_up:
        # generate random actions
        act_dim = env.get_action_space(group_handle)[0]
        actions = np.random.randint(0, act_dim, agent_num, dtype=np.int32)
    env.set_action(group_handle, actions)


if __name__ == "__main__":
    total_steps = max_epochs * max_episodes * max_steps

    # preparations
    prep_dirs_default(root_dir)
    logger.info("Directories prepared.")
    global_board.init(log_dir + "train_log")
    writer = global_board.writer

    env = magent.GridWorld(generate_combat_config(map_size), map_size=map_size)
    agent_num = int(np.sqrt(map_size * map_size * agent_ratio)) ** 2
    group1_handle, group2_handle = env.get_handles()

    # shape: (act,)
    action_dim = env.get_action_space(group1_handle)[0]
    # shape: (view_width, view_height, n_channel)
    view_space = env.get_view_space(group1_handle)
    view_dim = np.prod(view_space)
    # shape: (ID embedding + last action + last reward + relative pos)
    feature_dim = env.get_feature_space(group1_handle)[0]
def policy_noise(action):
    return t.clamp(
        add_clipped_normal_noise_to_action(action, c.policy_noise_params),
        -1, 1)


if __name__ == "__main__":
    save_env = SaveEnv(c.root_dir, restart_use_trial=c.restart_from_trial)
    prep_args(c, save_env)
    # save_env.remove_trials_older_than(diff_hour=1)

    global_board.init(save_env.get_trial_train_log_dir())
    writer = global_board.writer
    logger.info("Directories prepared.")

    # one actor and two critics, each paired with a target copy (TD3-style)
    actor = MW(
        Actor(observe_dim, action_dim, 1).to(c.device), c.device, c.device)
    actor_t = MW(
        Actor(observe_dim, action_dim, 1).to(c.device), c.device, c.device)
    critic = MW(
        Critic(observe_dim, action_dim).to(c.device), c.device, c.device)
    critic_t = MW(
        Critic(observe_dim, action_dim).to(c.device), c.device, c.device)
    critic2 = MW(
        Critic(observe_dim, action_dim).to(c.device), c.device, c.device)
    critic2_t = MW(
        Critic(observe_dim, action_dim).to(c.device), c.device, c.device)
    logger.info("Networks created")
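

# --- Hedged reference sketch (not from the original file) ---
# policy_noise() above delegates to add_clipped_normal_noise_to_action, which
# is not shown here. The self-contained function below illustrates the usual
# clipped-Gaussian target-policy smoothing it presumably performs; the sigma
# and noise_clip defaults are illustrative assumptions, not values read from
# c.policy_noise_params.
import torch as t


def clipped_normal_noise(action, sigma=0.2, noise_clip=0.5):
    # sample Gaussian noise, clip the noise itself, then clamp the
    # perturbed action back into the valid range [-1, 1]
    noise = t.clamp(t.randn_like(action) * sigma, -noise_clip, noise_clip)
    return t.clamp(action + noise, -1.0, 1.0)


# example usage (action_dim as defined in the surrounding script):
# smoothed = clipped_normal_noise(t.tanh(t.randn(4, action_dim)))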
c.learning_rate = 3e-4
c.entropy_weight = None
c.ppo_update_batch_size = 100
c.ppo_update_times = 50
c.ppo_update_int = 5  # = the number of episodes stored in ppo replay buffer
c.model_save_int = c.ppo_update_int * 20  # in episodes
c.profile_int = 50  # in episodes

if __name__ == "__main__":
    save_env = SaveEnv(c.root_dir, restart_use_trial=c.restart_from_trial)
    prep_args(c, save_env)
    # save_env.remove_trials_older_than(diff_hour=1)

    global_board.init(save_env.get_trial_train_log_dir())
    writer = global_board.writer
    logger.info("Directories prepared.")

    actor = MW(
        Actor(observe_dim, action_dim, 1).to(c.device), c.device, c.device)
    critic = MW(Critic(observe_dim).to(c.device), c.device, c.device)
    # place parameters in shared memory so worker processes use the same nets
    actor.share_memory()
    critic.share_memory()
    logger.info("Networks created")

    # default replay buffer storage is main cpu mem
    # when stored in main mem, takes about 0.65e-3 sec to move result from gpu to cpu,
    ppo = PPO(actor, critic,
              t.optim.Adam,
              nn.MSELoss(reduction='sum'),
              replay_device=c.device,
def load_framework(name):
    module = imp.import_module(".magent_" + name)
    return module.c, module.create_models, module.run_agents


if __name__ == "__main__":
    c1, create_models1, run_agents1 = load_framework(load_framework1)
    c2, create_models2, run_agents2 = load_framework(load_framework2)

    save_env1 = SaveEnv(c1.root_dir, restart_use_trial=load_trial1)
    prep_args(c1, save_env1)
    save_env2 = SaveEnv(c2.root_dir, restart_use_trial=load_trial2)
    prep_args(c2, save_env2)

    c1.restart_from_trial = load_trial1
    framework1 = create_models1()
    logger.info("Framework 1 initialized")

    c2.restart_from_trial = load_trial2
    framework2 = create_models2()
    logger.info("Framework 2 initialized")

    operators = [(framework1, run_agents1, load_framework1),
                 (framework2, run_agents2, load_framework2)]

    # testing
    # preparations
    config = generate_combat_config(map_size)
    env = magent.GridWorld(config, map_size=map_size)
    env.reset()

    global_board.init(test_root_dir)
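

# --- Hedged reference sketch (not from the original file) ---
# A generic, standalone version of the dynamic-loading pattern used by
# load_framework(). Note that importlib.import_module only accepts a relative
# name such as ".magent_" + name when the package argument is also given; the
# names in the example comment below are illustrative stand-ins.
import importlib


def load_module(name, package=None):
    # resolve an absolute name ("math") or a relative one
    # (".magent_ppo" together with package="examples") to a module object
    return importlib.import_module(name, package=package)


# example usage with a standard-library module as a stand-in:
# math_mod = load_module("math")
# print(math_mod.sqrt(2.0))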
c.learning_rate = 1e-3
c.entropy_weight = 1e-2
c.ppo_update_batch_size = 100
c.ppo_update_times = 4
c.ppo_update_int = 6  # = the number of episodes stored in ppo replay buffer
c.model_save_int = 100  # in episodes
c.profile_int = 50  # in episodes

if __name__ == "__main__":
    save_env = SaveEnv(c.root_dir, restart_use_trial=c.restart_from_trial)
    prep_args(c, save_env)
    # save_env.remove_trials_older_than(diff_hour=1)

    global_board.init(save_env.get_trial_train_log_dir())
    writer = global_board.writer
    logger.info("Directories prepared.")

    actor = MW(Actor(observe_dim, action_dim).to(c.device), c.device, c.device)
    critic = MW(Critic(observe_dim).to(c.device), c.device, c.device)
    logger.info("Networks created")

    # default replay buffer storage is main cpu mem
    # when stored in main mem, takes about 0.65e-3 sec to move result from gpu to cpu,
    ppo = PPO(actor, critic,
              t.optim.Adam,
              nn.MSELoss(reduction='sum'),
              replay_device=c.device,
              replay_size=c.replay_size,
              entropy_weight=c.entropy_weight,
              discount=c.discount,
                                 neighbor_num, True, device)
    negotiator_t = SwarmNegotiator(observe_dim, action_dim, history_depth,
                                   neighbor_num, True, device)

    # currently, all agents share the same two networks
    # TODO: implement K-Sub policies
    actor = WrappedActorNet(base_actor, negotiator)
    actor_t = WrappedActorNet(base_actor_t, negotiator_t)
    critic = WrappedCriticNet(
        SwarmCritic(observe_dim, action_dim, history_depth, neighbor_num,
                    device))
    critic_t = WrappedCriticNet(
        SwarmCritic(observe_dim, action_dim, history_depth, neighbor_num,
                    device))
    logger.info("Networks created")

    # only used to load model
    ddpg = DDPG(actor, actor_t, critic, critic_t,
                t.optim.Adam, nn.MSELoss(reduction='sum'), device)
    ddpg.load(load_dir, save_map)
    logger.info("DDPG framework initialized")

    # evaluation
    # preparations
    env = BipedalMultiCarrier(agent_num)
    agents = [
        SwarmAgent(base_actor, negotiator, len(neighbors),