Example #1
    def __init__(self, config, policy_config):
        """
        Initialize relevant arguments.
        """
        self.args = config["args"]
        self.device = config['device']
        self.obs_dim = policy_config["obs_dim"]
        self.action_space = policy_config["act_space"]
        self.act_dim = get_dim_from_space(self.action_space)
        self.hidden_size = self.args.hidden_size
        self.central_obs_dim = policy_config["cent_obs_dim"]
        self.multidiscrete = isinstance(self.action_space, MultiDiscrete)

        if self.args.prev_act_inp:
            # the previous action is local information, so the agent can still act in a decentralized way
            self.q_network_input_dim = self.obs_dim + self.act_dim
        else:
            self.q_network_input_dim = self.obs_dim
        # Local recurrent q network for the agent
        self.q_network = AgentQFunction(self.q_network_input_dim, self.act_dim,
                                        self.args, self.device)

        self.schedule = DecayThenFlatSchedule(self.args.epsilon_start,
                                              self.args.epsilon_finish,
                                              self.args.epsilon_anneal_time,
                                              decay="linear")
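The repository's DecayThenFlatSchedule is only constructed above, not shown. Below is a minimal, self-contained sketch of what a linear decay-then-flat epsilon schedule typically looks like; the class name, the eval(t) accessor, and the numbers are illustrative assumptions, not the repository's API.

class LinearDecayThenFlatSchedule:
    """Linearly anneal epsilon from start to finish, then hold it flat."""

    def __init__(self, start, finish, anneal_time):
        self.start = start
        self.finish = finish
        self.anneal_time = anneal_time
        self.delta = (start - finish) / anneal_time  # per-step decrement

    def eval(self, t):
        """Return epsilon at environment step t."""
        return max(self.finish, self.start - self.delta * t)

# usage: epsilon for eps-greedy exploration at step 5000
schedule = LinearDecayThenFlatSchedule(start=1.0, finish=0.05, anneal_time=50000)
eps = schedule.eval(5000)  # 0.905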
Example #2
    def __init__(self, config, policy_config, discrete, train=True):

        self.config = config
        self.device = config['device']
        self.args = self.config["args"]        
        self.tau = self.args.tau
        self.lr = self.args.lr
        self.target_entropy_coef = self.args.target_entropy_coef
        self.opti_eps = self.args.opti_eps
        self.weight_decay = self.args.weight_decay
        self.prev_act_inp = self.args.prev_act_inp

        self.central_obs_dim, self.central_act_dim = policy_config["cent_obs_dim"], policy_config["cent_act_dim"]
        self.obs_space = policy_config["obs_space"]
        self.obs_dim = get_dim_from_space(self.obs_space)
        self.act_space = policy_config["act_space"]
        self.act_dim = get_dim_from_space(self.act_space)
        self.hidden_size = self.args.hidden_size
        self.discrete_action = discrete
        self.multidiscrete = isinstance(self.act_space, MultiDiscrete)

        if self.discrete_action:
            self.actor = R_DiscreteActor(self.args, self.obs_dim, self.act_dim, self.device, take_prev_action=self.prev_act_inp)
            self.target_entropy = -np.log((1.0 / self.act_dim)) * self.target_entropy_coef # slightly less than max possible entropy

        else:
            self.actor = R_GaussianActor(self.args, self.obs_dim, self.act_dim, self.act_space, self.device, take_prev_action=self.prev_act_inp)
            self.target_entropy = -torch.prod(torch.Tensor(self.act_space.shape)).item()  # standard SAC heuristic: negative of the action dimensionality

        self.critic = R_Critic(self.args, self.central_obs_dim, self.central_act_dim, self.device, discrete=False)
        self.target_critic = R_Critic(self.args, self.central_obs_dim, self.central_act_dim, self.device, discrete=False)
        # sync the target weights
        self.target_critic.load_state_dict(self.critic.state_dict())

        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(), lr=self.lr, eps=self.opti_eps, weight_decay=self.weight_decay)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(), lr=self.lr, eps=self.opti_eps, weight_decay=self.weight_decay)

        self.alpha = self.config["args"].alpha # will get updated via log_alpha
        self.log_alpha = torch.tensor(np.log(self.alpha), requires_grad=True)
        self.alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=self.lr, eps=self.opti_eps, weight_decay=self.weight_decay)
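The log_alpha, target_entropy, and alpha_optimizer set up above are typically consumed by the standard SAC temperature update. Here is a hedged sketch of that update; the function name and argument layout are assumptions, not the repository's interface.

import torch

def update_temperature(log_alpha, alpha_optimizer, log_probs, target_entropy):
    """One gradient step on the standard SAC temperature loss."""
    # raise alpha when the policy's entropy (-log_probs) drops below target_entropy
    alpha_loss = -(log_alpha * (log_probs + target_entropy).detach()).mean()
    alpha_optimizer.zero_grad()
    alpha_loss.backward()
    alpha_optimizer.step()
    return log_alpha.exp().item()  # temperature used in the actor/critic losses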
Example #3
    def __init__(self, args, observation_dim, action_space):
        """
        Initialize relevant arguments.
        """
        self.args = args
        self.observation_dim = observation_dim
        self.action_space = action_space
        self.action_dim = get_dim_from_space(action_space)
        self.q_network_input_dim = observation_dim + (2 * self.action_dim)

        self.q_network = AgentQFunction(self.q_network_input_dim, 1, args)

        self.schedule = DecayThenFlatSchedule(args.epsilon_start,
                                              args.epsilon_finish,
                                              args.epsilon_anneal_time,
                                              decay="linear")
Example #4
def main(args):
    # ray.init(local_mode=True)
    env_parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter)
    env_parser.add_argument('--map_name',
                            type=str,
                            default='3m',
                            help="Which sc env to run on")
    env_parser.add_argument('--use_available_actions',
                            action='store_false',
                            default=True,
                            help="take turn to take action")

    env_args = env_parser.parse_known_args(args)[0]

    # algorithm specific parameters
    alg_flags, alg_arg_dict = parse_args(args)

    # set random seeds and the number of torch threads
    torch.manual_seed(alg_flags.seed)
    torch.cuda.manual_seed_all(alg_flags.seed)
    np.random.seed(alg_flags.seed)

    # cuda
    if alg_flags.cuda and torch.cuda.is_available():
        device = torch.device("cuda:0")
        torch.set_num_threads(1)
        if alg_flags.cuda_deterministic:
            torch.backends.cudnn.benchmark = False
            torch.backends.cudnn.deterministic = True
    else:
        device = torch.device("cpu")
        torch.set_num_threads(alg_flags.n_training_threads)

    # create dummy env and get relevant env info to set up policies
    env = StarCraft2Env(map_name=env_args.map_name, seed=alg_flags.seed)
    test_env = StarCraft2Env(map_name=env_args.map_name, seed=alg_flags.seed)
    buffer_length = get_map_params(env_args.map_name)["limit"]
    alg_arg_dict["n_agents"] = env.n_agents

    # set up the output directory for tensorboard logs, hyperparameters, and saved models
    model_dir = Path(
        '../results'
    ) / alg_flags.env_name / env_args.map_name / alg_flags.algorithm_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [
            int(str(folder.name).split('run')[1])
            for folder in model_dir.iterdir()
            if str(folder.name).startswith('run')
        ]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)

    run_dir = model_dir / curr_run
    if not run_dir.exists():
        os.makedirs(str(run_dir))

    with open(str(run_dir) + '/params.json', 'w+') as fp:
        json.dump(alg_arg_dict, fp)

    _, cent_act_dim, _ = get_state_dim(env.observation_space, env.action_space)
    cent_obs_dim = get_dim_from_space(env.share_observation_space[0])

    # create policies and mapping fn
    if alg_flags.share_policy:
        policy_info = {
            'policy_0': {
                "cent_obs_dim": cent_obs_dim,
                "cent_act_dim": cent_act_dim,
                "obs_space": env.observation_space[0],
                "act_space": env.action_space[0]
            }
        }
        policy_mapping_fn = lambda id: 'policy_0'
    else:
        policy_info = {
            'policy_' + str(id): {
                "cent_obs_dim": cent_obs_dim,
                "cent_act_dim": cent_act_dim,
                "obs_space": env.observation_space[id],
                "act_space": env.action_space[id]
            }
            for id in env.agent_ids
        }
        policy_mapping_fn = lambda id: 'policy_' + str(id)

    config = {
        "args": alg_flags,
        "run_dir": run_dir,
        "policy_info": policy_info,
        "policy_mapping_fn": policy_mapping_fn,
        "env": env,
        "test_env": test_env,
        "agent_ids": env.agent_ids,
        "device": device,
        "buffer_length": buffer_length,
        "use_available_actions": env_args.use_available_actions
    }

    trainable = RMADDPGTrainable(config=config)
    test_times = (alg_flags.num_env_steps // alg_flags.test_interval) + 1
    for test_time in range(test_times):
        print(
            "\n Map {} Algo {} updates {}/{} times, total num timesteps {}/{}.\n"
            .format(env_args.map_name, alg_flags.algorithm_name, test_time,
                    test_times, trainable.total_env_steps,
                    alg_flags.num_env_steps))
        trainable.train()
    trainable.logger.export_scalars_to_json(
        str(trainable.log_dir + '/summary.json'))
    trainable.logger.close()
    env.close()
    test_env.close()
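The main function above expects a raw argument list (it calls parse_known_args(args)). A minimal entry-point sketch follows; the script name and flag values are illustrative only.

import sys

if __name__ == "__main__":
    # e.g. python train_smac.py --map_name 3m (plus the algorithm flags read by parse_args)
    main(sys.argv[1:])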
Example #5
def main(args):
    # ray.init(local_mode=True)
    env_parser = argparse.ArgumentParser(
        formatter_class=argparse.RawDescriptionHelpFormatter
    )
    env_parser.add_argument('--hanabi_name', type=str, default='Hanabi-Very-Small', help="Which env to run on")
    env_parser.add_argument('--num_players', type=int, default=2, help="number of players")
    env_parser.add_argument('--take_turn', action='store_false', default=True, help="take turn to take action")
    env_parser.add_argument('--use_cent_agent_obs', action='store_false', default=True, help="use agent-specific centralized observations")
    env_parser.add_argument('--use_available_actions', action='store_false', default=True, help="whether to use the available-actions mask")
    
    env_args = env_parser.parse_known_args(args)[0]

    # algorithm specific parameters
    alg_flags, alg_arg_dict = parse_args(args)

    # set random seeds and the number of torch threads
    torch.manual_seed(alg_flags.seed)
    torch.cuda.manual_seed_all(alg_flags.seed)
    np.random.seed(alg_flags.seed)
    
    # cuda
    if alg_flags.cuda and torch.cuda.is_available():
        device = torch.device("cuda:0")
        torch.set_num_threads(1)
        if alg_flags.cuda_deterministic:
            torch.backends.cudnn.benchmark = False
            torch.backends.cudnn.deterministic = True
    else:
        device = torch.device("cpu")
        torch.set_num_threads(alg_flags.n_training_threads)

    # env for testing and warmup (contains parallel envs)
    env = HanabiEnv(env_args.hanabi_name, env_args.num_players, alg_flags.seed)
    #test_env = HanabiEnv(env_args.hanabi_name, env_args.num_players, alg_flags.seed)
    cent_obs_dim = get_dim_from_space(env.share_observation_space[0])
    alg_arg_dict["n_agents"] = env.num_agents
    alg_arg_dict["cent_obs_dim"] = cent_obs_dim

    # set up the output directory for tensorboard logs, hyperparameters, and saved models
    model_dir = Path('../results') / alg_flags.env_name / env_args.hanabi_name / alg_flags.algorithm_name
    if not model_dir.exists():
        curr_run = 'run1'
    else:
        exst_run_nums = [int(str(folder.name).split('run')[1]) for folder in model_dir.iterdir() if str(folder.name).startswith('run')]
        if len(exst_run_nums) == 0:
            curr_run = 'run1'
        else:
            curr_run = 'run%i' % (max(exst_run_nums) + 1)

    run_dir = model_dir / curr_run
    if not run_dir.exists():
        os.makedirs(str(run_dir))
    with open(str(run_dir) + '/params.json', 'w+') as fp:
        json.dump(alg_arg_dict, fp)
    
    # create policies and mapping fn
    if alg_flags.share_policy:
        policy_info = {
            'policy_0': {"cent_obs_dim": cent_obs_dim,
                         "obs_dim": get_dim_from_space(env.observation_space[0]),
                         "act_space": env.action_space[0]}
        }
        policy_mapping_fn = lambda id: 'policy_0'
    else:
        policy_info = {
            'policy_' + str(id): {"cent_obs_dim": cent_obs_dim,
                                  "obs_dim": get_dim_from_space(env.observation_space[id]),
                                  "act_space": env.action_space[id]}
            for id in env.agent_ids
        }
        policy_mapping_fn = lambda id: 'policy_' + str(id)

    config = {"args": alg_flags,
              "run_dir": run_dir, 
              "policy_info": policy_info, 
              "policy_mapping_fn": policy_mapping_fn,
              "env": env, 
              "test_env": env, 
              "agent_ids": env.agent_ids, 
              "take_turn":env_args.take_turn, 
              "use_cent_agent_obs":env_args.use_cent_agent_obs, 
              "use_available_actions":env_args.use_available_actions,
              "device": device}

    # trainable = MADDPGTrainable(config=config)
    trainable = QMixTrainable(config=config)
    test_times = (alg_flags.num_env_steps // alg_flags.test_interval) + 1
    for test_time in range(test_times):
        print("\n Hanabi {} Algo {} updates {}/{} times, total num timesteps {}/{}.\n"
                .format(env_args.hanabi_name,
                        alg_flags.algorithm_name,
                        test_time, 
                        test_times,
                        trainable.total_env_steps,
                        alg_flags.num_env_steps))
        trainable.train()
    trainable.logger.export_scalars_to_json(str(trainable.log_dir + '/summary.json'))
    trainable.logger.close()
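The config above passes a use_available_actions flag to QMixTrainable. The sketch below shows epsilon-greedy action selection with an available-actions mask, which is how such a flag is commonly used; the helper is hypothetical, not the trainable's actual method.

import numpy as np
import torch

def masked_eps_greedy(q_values, available_actions, eps):
    """q_values: (n_actions,) tensor; available_actions: (n_actions,) array of 0/1."""
    avail = torch.as_tensor(available_actions, dtype=torch.bool)
    masked_q = q_values.clone()
    masked_q[~avail] = -1e10  # never select unavailable actions greedily
    if np.random.rand() < eps:
        # explore uniformly over the available actions only
        return int(np.random.choice(np.flatnonzero(available_actions)))
    return int(torch.argmax(masked_q).item())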
Example #6
    def __init__(self, config, policy_config, train=True):

        self.config = config
        self.device = config['device']
        self.args = self.config["args"]
        self.tau = self.args.tau
        self.lr = self.args.lr
        self.opti_eps = self.args.opti_eps
        self.weight_decay = self.args.weight_decay
        self.prev_act_inp = self.args.prev_act_inp

        self.central_obs_dim, self.central_act_dim = policy_config[
            "cent_obs_dim"], policy_config["cent_act_dim"]
        self.obs_space = policy_config["obs_space"]
        self.obs_dim = get_dim_from_space(self.obs_space)
        self.act_space = policy_config["act_space"]
        self.act_dim = get_dim_from_space(self.act_space)
        self.hidden_size = self.args.hidden_size
        self.discrete_action = is_discrete(self.act_space)
        self.multidiscrete = isinstance(self.act_space, MultiDiscrete)

        self.actor = R_Actor(self.args,
                             self.obs_dim,
                             self.act_dim,
                             self.discrete_action,
                             self.device,
                             take_prev_action=self.prev_act_inp)
        self.critic = R_Critic(self.args, self.central_obs_dim,
                               self.central_act_dim, self.device)

        self.target_actor = R_Actor(self.args,
                                    self.obs_dim,
                                    self.act_dim,
                                    self.discrete_action,
                                    self.device,
                                    take_prev_action=self.prev_act_inp)
        self.target_critic = R_Critic(self.args, self.central_obs_dim,
                                      self.central_act_dim, self.device)
        # sync the target weights
        self.target_actor.load_state_dict(self.actor.state_dict())
        self.target_critic.load_state_dict(self.critic.state_dict())

        if train:
            self.actor_optimizer = torch.optim.Adam(
                self.actor.parameters(),
                lr=self.lr,
                eps=self.opti_eps,
                weight_decay=self.weight_decay)
            self.critic_optimizer = torch.optim.Adam(
                self.critic.parameters(),
                lr=self.lr,
                eps=self.opti_eps,
                weight_decay=self.weight_decay)

            if self.discrete_action:
                # eps greedy exploration
                self.exploration = DecayThenFlatSchedule(
                    self.args.epsilon_start,
                    self.args.epsilon_finish,
                    self.args.epsilon_anneal_time,
                    decay="linear")
            else:
                # No epsilon schedule needed: Gaussian noise is added in get_actions instead
                self.exploration = None
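Example #6 keeps target copies of the actor and critic and stores self.tau, which is typically consumed by a soft (Polyak) target update after each gradient step. A hedged sketch of such an update follows; the helper name is an assumption, not necessarily the repository's own function.

import torch

def soft_update(target_net, source_net, tau):
    """Move every target parameter a fraction tau toward the source parameter."""
    with torch.no_grad():
        for target_param, param in zip(target_net.parameters(),
                                       source_net.parameters()):
            target_param.data.copy_(tau * param.data + (1.0 - tau) * target_param.data)

# usage after an update step:
# soft_update(self.target_critic, self.critic, self.tau)
# soft_update(self.target_actor, self.actor, self.tau)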