Example #1
        # pick the greedy action from this agent's final network output
        actions[ag] = t.argmax(agents[ag].final_step()).item()

    if is_warm_up:
        # generate random actions
        act_dim = env.get_action_space(group_handle)[0]
        actions = np.random.randint(0, act_dim, agent_num, dtype=np.int32)

    env.set_action(group_handle, actions)


if __name__ == "__main__":
    total_steps = max_epochs * max_episodes * max_steps

    # preparations
    prep_dirs_default(root_dir)
    logger.info("Directories prepared.")
    global_board.init(log_dir + "train_log")
    writer = global_board.writer

    env = magent.GridWorld(generate_combat_config(map_size), map_size=map_size)
    agent_num = int(np.sqrt(map_size * map_size * agent_ratio))**2
    group1_handle, group2_handle = env.get_handles()

    # shape: (act,)
    action_dim = env.get_action_space(group1_handle)[0]
    # shape: (view_width, view_height, n_channel)
    view_space = env.get_view_space(group1_handle)
    view_dim = np.prod(view_space)
    # shape: (ID embedding + last action + last reward + relative pos)
    feature_dim = env.get_feature_space(group1_handle)[0]
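
For context, a snippet like the one above typically sits inside MAgent's per-step loop, sketched below. `select_actions` stands in for the per-agent argmax / warm-up logic shown earlier, and the exact reward and termination handling here is an assumption, not part of the original example.

env.reset()
env.add_agents(group1_handle, method="random", n=agent_num)
env.add_agents(group2_handle, method="random", n=agent_num)

for step in range(max_steps):
    for handle in (group1_handle, group2_handle):
        views, features = env.get_observation(handle)   # per-agent view / feature arrays
        actions = select_actions(views, features)        # placeholder for the logic above
        env.set_action(handle, actions)                  # one int32 action per alive agent
    done = env.step()                                    # advance the grid world by one tick
    rewards = env.get_reward(group1_handle)              # per-agent rewards for group 1
    env.clear_dead()                                     # drop killed agents before the next tick
    if done:
        break
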
Example #2

def policy_noise(action):
    # clamp the noisy action back into the valid [-1, 1] action range
    return t.clamp(
        add_clipped_normal_noise_to_action(action, c.policy_noise_params),
        -1, 1)


if __name__ == "__main__":
    save_env = SaveEnv(c.root_dir, restart_use_trial=c.restart_from_trial)
    prep_args(c, save_env)

    # save_env.remove_trials_older_than(diff_hour=1)
    global_board.init(save_env.get_trial_train_log_dir())
    writer = global_board.writer
    logger.info("Directories prepared.")

    actor = MW(
        Actor(observe_dim, action_dim, 1).to(c.device), c.device, c.device)
    actor_t = MW(
        Actor(observe_dim, action_dim, 1).to(c.device), c.device, c.device)
    critic = MW(
        Critic(observe_dim, action_dim).to(c.device), c.device, c.device)
    critic_t = MW(
        Critic(observe_dim, action_dim).to(c.device), c.device, c.device)
    critic2 = MW(
        Critic(observe_dim, action_dim).to(c.device), c.device, c.device)
    critic2_t = MW(
        Critic(observe_dim, action_dim).to(c.device), c.device, c.device)

    logger.info("Networks created")
Example #3
c.learning_rate = 3e-4
c.entropy_weight = None
c.ppo_update_batch_size = 100
c.ppo_update_times = 50
c.ppo_update_int = 5  # = the number of episodes stored in ppo replay buffer
c.model_save_int = c.ppo_update_int * 20  # in episodes
c.profile_int = 50  # in episodes

if __name__ == "__main__":
    save_env = SaveEnv(c.root_dir, restart_use_trial=c.restart_from_trial)
    prep_args(c, save_env)

    # save_env.remove_trials_older_than(diff_hour=1)
    global_board.init(save_env.get_trial_train_log_dir())
    writer = global_board.writer
    logger.info("Directories prepared.")

    actor = MW(
        Actor(observe_dim, action_dim, 1).to(c.device), c.device, c.device)
    critic = MW(Critic(observe_dim).to(c.device), c.device, c.device)
    actor.share_memory()
    critic.share_memory()
    logger.info("Networks created")

    # The default replay buffer storage is main (CPU) memory; moving each result
    # from GPU to CPU takes about 0.65e-3 s, so the buffer is kept on c.device instead.
    ppo = PPO(actor,
              critic,
              t.optim.Adam,
              nn.MSELoss(reduction='sum'),
              replay_device=c.device,
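
The constants defined above set the training cadence: the replay buffer accumulates `ppo_update_int` episodes, then the PPO update is run `ppo_update_times` times over batches of `ppo_update_batch_size` transitions. A rough sketch of that loop follows; `run_episode`, `ppo.store_episode`, `ppo.update`, and `ppo.save` are assumed names for illustration, not confirmed APIs of this project.

for episode in range(1, max_episodes + 1):
    transitions = run_episode(env, actor)          # collect one on-policy episode (placeholder)
    ppo.store_episode(transitions)                 # assumed API: buffer holds ppo_update_int episodes
    if episode % c.ppo_update_int == 0:
        for _ in range(c.ppo_update_times):
            ppo.update()                           # assumed to sample ppo_update_batch_size transitions
    if episode % c.model_save_int == 0:
        ppo.save(model_dir)                        # assumed API; model_dir is a placeholder path
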
Example #4
import importlib

def load_framework(name):
    # importlib needs an explicit package anchor for relative imports like ".magent_<name>"
    module = importlib.import_module(".magent_" + name, package=__package__)
    return module.c, module.create_models, module.run_agents


if __name__ == "__main__":
    c1, create_models1, run_agents1 = load_framework(load_framework1)
    c2, create_models2, run_agents2 = load_framework(load_framework2)
    save_env1 = SaveEnv(c1.root_dir, restart_use_trial=load_trial1)
    prep_args(c1, save_env1)
    save_env2 = SaveEnv(c2.root_dir, restart_use_trial=load_trial2)
    prep_args(c2, save_env2)

    c1.restart_from_trial = load_trial1
    framework1 = create_models1()
    logger.info("Framework 1 initialized")

    c2.restart_from_trial = load_trial2
    framework2 = create_models2()
    logger.info("Framework 2 initialized")

    operators = [(framework1, run_agents1, load_framework1),
                 (framework2, run_agents2, load_framework2)]

    # testing
    # preparations
    config = generate_combat_config(map_size)
    env = magent.GridWorld(config, map_size=map_size)
    env.reset()

    global_board.init(test_root_dir)
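
For reference, a standalone illustration of the relative dynamic import performed by `load_framework`; the package name `frameworks` is made up for this sketch:

import importlib

# relative form: needs an explicit anchor package
mod = importlib.import_module(".magent_ppo", package="frameworks")
# equivalent absolute form, which also works from a plain script
mod = importlib.import_module("frameworks.magent_ppo")
c, create_models, run_agents = mod.c, mod.create_models, mod.run_agents
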
Example #5
c.learning_rate = 1e-3
c.entropy_weight = 1e-2
c.ppo_update_batch_size = 100
c.ppo_update_times = 4
c.ppo_update_int = 6  # = the number of episodes stored in ppo replay buffer
c.model_save_int = 100  # in episodes
c.profile_int = 50  # in episodes

if __name__ == "__main__":
    save_env = SaveEnv(c.root_dir, restart_use_trial=c.restart_from_trial)
    prep_args(c, save_env)

    # save_env.remove_trials_older_than(diff_hour=1)
    global_board.init(save_env.get_trial_train_log_dir())
    writer = global_board.writer
    logger.info("Directories prepared.")

    actor = MW(Actor(observe_dim, action_dim).to(c.device), c.device, c.device)
    critic = MW(Critic(observe_dim).to(c.device), c.device, c.device)
    logger.info("Networks created")

    # The default replay buffer storage is main (CPU) memory; moving each result
    # from GPU to CPU takes about 0.65e-3 s, so the buffer is kept on c.device instead.
    ppo = PPO(actor,
              critic,
              t.optim.Adam,
              nn.MSELoss(reduction='sum'),
              replay_device=c.device,
              replay_size=c.replay_size,
              entropy_weight=c.entropy_weight,
              discount=c.discount,
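
Unlike Example #3, this configuration enables an entropy bonus (`entropy_weight = 1e-2`). The snippet below is not this project's code; it only illustrates what such a weight typically controls in a PPO actor loss.

import torch as t

def ppo_actor_loss(ratio, advantage, dist, clip_eps=0.2, entropy_weight=1e-2):
    # clipped surrogate objective plus an entropy bonus that keeps the policy
    # from collapsing to a near-deterministic distribution too early
    surrogate = t.min(ratio * advantage,
                      t.clamp(ratio, 1.0 - clip_eps, 1.0 + clip_eps) * advantage)
    entropy = dist.entropy().mean()                # dist: a torch.distributions object
    return -(surrogate.mean() + entropy_weight * entropy)
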
Example #6
    negotiator = SwarmNegotiator(observe_dim, action_dim, history_depth,
                                 neighbor_num, True, device)
    negotiator_t = SwarmNegotiator(observe_dim, action_dim, history_depth,
                                   neighbor_num, True, device)

    # currently, all agents share the same two networks
    # TODO: implement K-Sub policies
    actor = WrappedActorNet(base_actor, negotiator)
    actor_t = WrappedActorNet(base_actor_t, negotiator_t)
    critic = WrappedCriticNet(
        SwarmCritic(observe_dim, action_dim, history_depth, neighbor_num,
                    device))
    critic_t = WrappedCriticNet(
        SwarmCritic(observe_dim, action_dim, history_depth, neighbor_num,
                    device))

    logger.info("Networks created")

    # only used to load model
    ddpg = DDPG(actor, actor_t, critic, critic_t, t.optim.Adam,
                nn.MSELoss(reduction='sum'), device)

    ddpg.load(load_dir, save_map)
    logger.info("DDPG framework initialized")

    # evaluation
    # preparations
    env = BipedalMultiCarrier(agent_num)
    agents = [
        SwarmAgent(base_actor,
                   negotiator,
                   len(neighbors),
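
General background rather than code from this project: the `*_t` networks passed to `DDPG` are target copies, which during training are usually tracked with a Polyak/soft update like the one below; in this evaluation script they exist mainly so that `ddpg.load()` can restore a complete checkpoint.

import torch as t

def soft_update(target_net, online_net, tau=0.005):
    # move each target parameter a small step (tau) towards its online counterpart
    with t.no_grad():
        for tp, p in zip(target_net.parameters(), online_net.parameters()):
            tp.mul_(1.0 - tau).add_(p, alpha=tau)
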