Example #1
def create_models():
    config = generate_combat_config(c.map_size)
    # input/output shapes (local view, feature vector, action size) of the first agent
    view_shape, feature_shape, action_dim = get_agent_io_shapes(config, c.map_size)[0]

    actor = MW(Actor(view_shape, feature_shape,
                     action_dim, c.conv).to(c.device), c.device, c.device)
    critic = MW(Critic(view_shape, feature_shape,
                       c.conv).to(c.device), c.device, c.device)

    # place parameters in shared memory so other processes can use the same networks
    actor.share_memory()
    critic.share_memory()

    ppo = PPO(actor, critic,
              t.optim.Adam, nn.MSELoss(reduction='sum'),
              replay_device=c.storage_device,
              replay_size=c.replay_size,
              entropy_weight=c.entropy_weight,
              discount=c.discount,
              update_times=c.ppo_update_times,
              batch_size=c.ppo_update_batch_size,
              learning_rate=c.learning_rate)

    if c.restart_from_trial is not None:
        ppo.load(save_env.get_trial_model_dir())

    return ppo
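
A note on the share_memory() calls above: moving a module's parameters into shared memory is what lets other processes run the same actor and critic without copying their weights. The following is a minimal, self-contained sketch of that mechanism, not code from this project; the worker function and the toy nn.Linear model are illustrative assumptions.

import torch as t
import torch.nn as nn
import torch.multiprocessing as mp


def worker(shared_net):
    # the child process sees the same underlying parameter storage as the parent
    with t.no_grad():
        print("worker output:", shared_net(t.ones(1, 4)))


if __name__ == "__main__":
    net = nn.Linear(4, 2)
    net.share_memory()  # same call as actor.share_memory() / critic.share_memory() above
    ctx = mp.get_context("spawn")
    p = ctx.Process(target=worker, args=(net,))
    p.start()
    p.join()
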
Example #2
def create_models():
    config = generate_combat_config(c.map_size)
    view_shape, feature_shape, action_dim = get_agent_io_shapes(
        config, c.map_size)[0]

    # online actor/critic and their target copies (actor_t, critic_t)
    actor = MW(
        Actor(view_shape, feature_shape, action_dim, c.conv).to(c.device),
        c.device, c.device)
    actor_t = MW(
        Actor(view_shape, feature_shape, action_dim, c.conv).to(c.device),
        c.device, c.device)
    critic = MW(
        Critic(view_shape, feature_shape, c.conv).to(c.device), c.device,
        c.device)
    critic_t = MW(
        Critic(view_shape, feature_shape, c.conv).to(c.device), c.device,
        c.device)

    ddpg = HDDPG(actor,
                 actor_t,
                 critic,
                 critic_t,
                 t.optim.Adam,
                 nn.MSELoss(reduction='sum'),
                 q_increase_rate=c.q_increase_rate,
                 q_decrease_rate=c.q_decrease_rate,
                 discount=0.99,
                 update_rate=0.005,
                 batch_size=c.ddpg_update_batch_size,
                 learning_rate=0.001,
                 replay_size=c.replay_size,
                 replay_device=c.storage_device)

    if c.restart_from_trial is not None:
        ddpg.load(save_env.get_trial_model_dir())

    return ddpg
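
HDDPG takes q_increase_rate and q_decrease_rate, which points at a hysteretic critic update: temporal-difference errors that would raise the Q estimate are weighted differently from those that would lower it. The sketch below illustrates that idea in plain PyTorch; the exact loss used inside HDDPG may differ, and hysteretic_td_error with its default rates is an illustrative assumption.

import torch as t


def hysteretic_td_error(q_value, q_target, q_increase_rate=1.0, q_decrease_rate=0.3):
    # positive TD errors keep full weight, negative ones are damped,
    # which biases the critic towards optimistic value estimates
    td = q_target - q_value
    scale = t.where(td >= 0,
                    t.full_like(td, q_increase_rate),
                    t.full_like(td, q_decrease_rate))
    return scale * td


if __name__ == "__main__":
    q = t.tensor([0.5, 0.5])
    target = t.tensor([1.0, 0.0])
    print(hysteretic_td_error(q, target))  # tensor([0.5000, -0.1500])
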
Example #3
def policy_noise(action):
    # add clipped normal noise to the action, then clamp it back into [-1, 1]
    return t.clamp(
        add_clipped_normal_noise_to_action(action, c.policy_noise_params), -1,
        1)


if __name__ == "__main__":
    save_env = SaveEnv(c.root_dir, restart_use_trial=c.restart_from_trial)
    prep_args(c, save_env)

    # save_env.remove_trials_older_than(diff_hour=1)
    global_board.init(save_env.get_trial_train_log_dir())
    writer = global_board.writer
    logger.info("Directories prepared.")

    actor = MW(
        Actor(observe_dim, action_dim, 1).to(c.device), c.device, c.device)
    actor_t = MW(
        Actor(observe_dim, action_dim, 1).to(c.device), c.device, c.device)
    critic = MW(
        Critic(observe_dim, action_dim).to(c.device), c.device, c.device)
    critic_t = MW(
        Critic(observe_dim, action_dim).to(c.device), c.device, c.device)
    # a second critic and its target network (twin-critic setup)
    critic2 = MW(
        Critic(observe_dim, action_dim).to(c.device), c.device, c.device)
    critic2_t = MW(
        Critic(observe_dim, action_dim).to(c.device), c.device, c.device)

    logger.info("Networks created")

    # default replay buffer storage is main cpu mem
    # when stored in main mem, takes about 0.65e-3 sec to move result from gpu to cpu,
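
policy_noise above adds clipped noise to an action and clamps the result back into [-1, 1], the usual target-policy-smoothing pattern. Below is a self-contained sketch under the assumption that add_clipped_normal_noise_to_action samples Gaussian noise and clips it before adding; the parameter layout (mean, std, clip bounds) is a guess, not the project's actual c.policy_noise_params.

import torch as t


def add_clipped_normal_noise_to_action(action, params=(0.0, 0.2, -0.5, 0.5)):
    mean, std, noise_min, noise_max = params
    noise = t.clamp(t.randn_like(action) * std + mean, noise_min, noise_max)
    return action + noise


def policy_noise(action):
    # same shape as the original helper: noisy action clamped back into [-1, 1]
    return t.clamp(add_clipped_normal_noise_to_action(action), -1, 1)


if __name__ == "__main__":
    print(policy_noise(t.tensor([0.9, -0.9, 0.0])))
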
Example #4
c.ddpg_update_batch_size = 100
c.ddpg_warmup_steps = 2000
c.ddpg_average_target_int = 200
c.model_save_int = 100  # in episodes
c.profile_int = 50  # in episodes

if __name__ == "__main__":
    save_env = SaveEnv(c.root_dir, restart_use_trial=c.restart_from_trial)
    prep_args(c, save_env)

    # save_env.remove_trials_older_than(diff_hour=1)
    global_board.init(save_env.get_trial_train_log_dir())
    writer = global_board.writer
    logger.info("Directories prepared.")

    actor = MW(Actor(observe_dim, action_dim, 1))
    actor_t = MW(Actor(observe_dim, action_dim, 1))
    # unlike the actors, the critic is constructed with the total number of agents
    critic = MW(Critic(c.agent_num, observe_dim, action_dim))
    critic_t = MW(Critic(c.agent_num, observe_dim, action_dim))
    logger.info("Networks created")

    ddpg = MADDPG(c.agent_num,
                  actor,
                  actor_t,
                  critic,
                  critic_t,
                  t.optim.Adam,
                  nn.MSELoss(reduction='sum'),
                  sub_policy_num=c.sub_policy_num,
                  discount=0.99,
                  update_rate=0.005,
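
The critic here receives c.agent_num in addition to the per-agent observation and action sizes, which suggests a centralized MADDPG critic that scores the joint observations and actions of all agents while each actor stays local. A rough, self-contained sketch of such a critic follows; the project's real Critic class is not shown above and may be structured differently.

import torch as t
import torch.nn as nn


class CentralCritic(nn.Module):
    def __init__(self, agent_num, observe_dim, action_dim):
        super().__init__()
        in_dim = agent_num * (observe_dim + action_dim)
        self.net = nn.Sequential(nn.Linear(in_dim, 128), nn.ReLU(),
                                 nn.Linear(128, 1))

    def forward(self, all_obs, all_actions):
        # all_obs: [batch, agent_num, observe_dim]
        # all_actions: [batch, agent_num, action_dim]
        x = t.cat([all_obs.flatten(1), all_actions.flatten(1)], dim=1)
        return self.net(x)


if __name__ == "__main__":
    critic = CentralCritic(agent_num=3, observe_dim=4, action_dim=2)
    print(critic(t.randn(8, 3, 4), t.randn(8, 3, 2)).shape)  # torch.Size([8, 1])
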
Example #5
c.ppo_update_batch_size = 100
c.ppo_update_times = 4
c.ppo_update_int = 6  # = the number of episodes stored in ppo replay buffer
c.model_save_int = 100  # in episodes
c.profile_int = 50  # in episodes

if __name__ == "__main__":
    save_env = SaveEnv(c.root_dir, restart_use_trial=c.restart_from_trial)
    prep_args(c, save_env)

    # save_env.remove_trials_older_than(diff_hour=1)
    global_board.init(save_env.get_trial_train_log_dir())
    writer = global_board.writer
    logger.info("Directories prepared.")

    # PPO needs only an online actor and critic, no target networks
    actor = MW(Actor(observe_dim, action_dim).to(c.device), c.device, c.device)
    critic = MW(Critic(observe_dim).to(c.device), c.device, c.device)
    logger.info("Networks created")

    # default replay buffer storage is main cpu mem
    # when stored in main mem, takes about 0.65e-3 sec to move result from gpu to cpu,
    ppo = PPO(actor,
              critic,
              t.optim.Adam,
              nn.MSELoss(reduction='sum'),
              replay_device=c.device,
              replay_size=c.replay_size,
              entropy_weight=c.entropy_weight,
              discount=c.discount,
              update_times=c.ppo_update_times,
              learning_rate=c.learning_rate)
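
Every network in these examples is wrapped in MW together with two devices. A plausible reading is that MW routes incoming tensors to the wrapped module's device and moves its outputs to a designated output device; the sketch below only illustrates that assumption (ModuleWrapper is an illustrative name, not the project's actual MW implementation).

import torch as t
import torch.nn as nn


class ModuleWrapper(nn.Module):
    def __init__(self, module, input_device, output_device):
        super().__init__()
        self.module = module
        self.input_device = input_device
        self.output_device = output_device

    def forward(self, *args):
        # move tensor arguments to the input device, run the module,
        # then move the result to the output device
        args = [a.to(self.input_device) if t.is_tensor(a) else a for a in args]
        out = self.module(*args)
        return out.to(self.output_device) if t.is_tensor(out) else out


if __name__ == "__main__":
    wrapped = ModuleWrapper(nn.Linear(4, 2), "cpu", "cpu")
    print(wrapped(t.randn(3, 4)).shape)  # torch.Size([3, 2])
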