def __init__(self, observation_space, action_space, replay_buffer,
             hidden_sizes=256, critic_lr=0.001, actor_lr=0.002,
             batch_size=32, gamma=0.90, tau=0.01):
    self.observation_space = observation_space
    self.action_space = action_space
    self.replay_buffer = replay_buffer
    self.hidden_sizes = hidden_sizes
    self.critic_lr = critic_lr
    self.actor_lr = actor_lr
    self.batch_size = batch_size
    self.gamma = gamma
    self.tau = tau
    self.device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

    self.critic = CriticNetwork(observation_space=observation_space,
                                hidden_sizes=hidden_sizes).to(self.device)
    self.target_critic = CriticNetwork(observation_space=observation_space,
                                       hidden_sizes=hidden_sizes).to(self.device)
    self.critic_optim = optim.Adam(self.critic.parameters(), lr=critic_lr)
    hard_update(self.target_critic, self.critic)

    self.actor = ActorNetwork(observation_space=observation_space,
                              action_space=action_space,
                              hidden_sizes=hidden_sizes).to(self.device)
    self.target_actor = ActorNetwork(observation_space=observation_space,
                                     action_space=action_space,
                                     hidden_sizes=hidden_sizes).to(self.device)
    self.actor_optim = optim.Adam(self.actor.parameters(), lr=actor_lr)
    hard_update(self.target_actor, self.actor)

    self.loss_fuc = torch.nn.MSELoss()
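# Note: `hard_update` (and `soft_update`, used in later snippets) are target-network
# helpers that are not defined in these excerpts. Below is a minimal sketch of what
# such helpers typically look like; it is an assumption for readability, not
# necessarily the repository's exact implementation.


def hard_update(target, source):
    """Copy every parameter of `source` into `target` (target <- source)."""
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(s_param.data)


def soft_update(target, source, tau):
    """Polyak averaging: target <- tau * source + (1 - tau) * target."""
    for t_param, s_param in zip(target.parameters(), source.parameters()):
        t_param.data.copy_(tau * s_param.data + (1.0 - tau) * t_param.data)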
def __init__(self, agent_init_params, sa_size,
             gamma=0.95, tau=0.01, attend_tau=0.002,
             pi_lr=0.01, q_lr=0.01,
             reward_scale=10.,
             pol_hidden_dim=128,
             critic_hidden_dim=128, attend_heads=4,
             **kwargs):
    """
    Inputs:
        agent_init_params (list of dict): List of dicts with parameters to
                                          initialize each agent
            num_in_pol (int): Input dimensions to policy
            num_out_pol (int): Output dimensions to policy
        sa_size (list of (int, int)): Size of state and action space for
                                      each agent
        gamma (float): Discount factor
        tau (float): Target update rate
        pi_lr (float): Learning rate for policy
        q_lr (float): Learning rate for critic
        reward_scale (float): Scaling for reward (has effect of optimal
                              policy entropy)
        hidden_dim (int): Number of hidden dimensions for networks
    """
    self.nagents = len(sa_size)

    self.agents = [AttentionAgent(lr=pi_lr, hidden_dim=pol_hidden_dim,
                                  **params)
                   for params in agent_init_params]
    self.critic = AttentionCritic(sa_size, hidden_dim=critic_hidden_dim,
                                  attend_heads=attend_heads)
    self.target_critic = AttentionCritic(sa_size, hidden_dim=critic_hidden_dim,
                                         attend_heads=attend_heads)
    hard_update(self.target_critic, self.critic)
    self.critic_optimizer = Adam(self.critic.q_parameters(), lr=q_lr,
                                 weight_decay=1e-3)
    self.agent_init_params = agent_init_params
    self.gamma = gamma
    self.tau = tau
    self.attend_tau = attend_tau
    self.pi_lr = pi_lr
    self.q_lr = q_lr
    self.reward_scale = reward_scale
    self.pol_dev = 'cpu'  # device for policies
    self.critic_dev = 'cpu'  # device for critics
    self.trgt_pol_dev = 'cpu'  # device for target policies
    self.trgt_critic_dev = 'cpu'  # device for target critics
    self.niter = 0
def update_all_targets(self):
    """
    Update all target networks (called after normal updates have been
    performed for each agent)
    """
    if self.hard_update_interval is None:
        soft_update(self.target_critic, self.critic, self.tau)
        for a in self.agents:
            soft_update(a.target_policy, a.policy, self.tau)
    elif self.niter % self.hard_update_interval == 0:
        hard_update(self.target_critic, self.critic)
        for a in self.agents:
            hard_update(a.target_policy, a.policy)
def __init__(self, num_out_pol, hidden_dim=128, lr=0.01, onehot_dim=0):
    """
    Inputs:
        num_out_pol (int): number of dimensions for policy output
    """
    self.policy = DiscretePolicy(num_out_pol,
                                 hidden_dim=hidden_dim,
                                 onehot_dim=onehot_dim)
    self.target_policy = DiscretePolicy(num_out_pol,
                                        hidden_dim=hidden_dim,
                                        onehot_dim=onehot_dim)
    hard_update(self.target_policy, self.policy)
    self.policy_optimizer = Adam(self.policy.parameters(), lr=lr)
def __init__(self, num_in_pol, num_out_pol, hidden_dim=64, lr=0.01,
             onehot_dim=0):
    self.policy = DiscretePolicy(num_in_pol, num_out_pol,
                                 hidden_dim=hidden_dim,
                                 onehot_dim=onehot_dim)
    self.target_policy = DiscretePolicy(num_in_pol, num_out_pol,
                                        hidden_dim=hidden_dim,
                                        onehot_dim=onehot_dim)
    hard_update(self.target_policy, self.policy)
    self.policy_optimizer = Adam(self.policy.parameters(), lr=lr)
def update_all_targets(self):
    """
    Update all target networks (called after normal updates have been
    performed for each agent)
    """
    if self.commonCritic:
        soft_update(self.target_critic, self.critic, self.tau)
    for a_i, a in enumerate(self.agents):
        if not self.commonCritic:
            soft_update(a.target_critic, a.critic, self.tau)
        if a_i == 0:
            soft_update(a.target_policy, a.policy, self.tau)
        else:
            hard_update(a.policy, self.agents[0].policy)
            soft_update(a.target_policy, a.policy, self.tau)
    self.niter += 1
def __init__(self, algo_config: List[Tuple[int, int]], gamma=0.95, tau=0.01,
             pi_lr=0.01, q_lr=0.01,
             reward_scale=10.,
             pol_hidden_dim=128,
             critic_hidden_dim=128, attend_heads=4,
             **kwargs):
    """
    Inputs:
        algo_config (List[Tuple[int, int]]): Agent types which will exist in
                                             this environment
                                             Ex. [(20, 8), (20, 2)]
        gamma (float): Discount factor
        tau (float): Target update rate
        pi_lr (float): Learning rate for policy
        q_lr (float): Learning rate for critic
        reward_scale (float): Scaling for reward (has effect of optimal
                              policy entropy)
        hidden_dim (int): Number of hidden dimensions for networks
    """
    print(algo_config)

    # one AttentionAgent per (state_dim, action_dim) entry in algo_config
    self.agents = [AttentionAgent(sdim, adim, lr=pi_lr,
                                  hidden_dim=pol_hidden_dim)
                   for sdim, adim in algo_config]
    self.critic = AttentionCritic(algo_config, hidden_dim=critic_hidden_dim,
                                  attend_heads=attend_heads)
    self.target_critic = AttentionCritic(algo_config,
                                         hidden_dim=critic_hidden_dim,
                                         attend_heads=attend_heads)
    hard_update(self.target_critic, self.critic)
    self.critic_optimizer = Adam(self.critic.parameters(), lr=q_lr,
                                 weight_decay=1e-3)
    self.gamma = gamma
    self.tau = tau
    self.pi_lr = pi_lr
    self.q_lr = q_lr
    self.reward_scale = reward_scale
    self.pol_dev = 'cpu'  # device for policies
    self.critic_dev = 'cpu'  # device for critics
    self.trgt_pol_dev = 'cpu'  # device for target policies
    self.trgt_critic_dev = 'cpu'  # device for target critics
    self.niter = 0
    self.init_dict = {'gamma': gamma, 'tau': tau,
                      'pi_lr': pi_lr, 'q_lr': q_lr,
                      'reward_scale': reward_scale,
                      'pol_hidden_dim': pol_hidden_dim,
                      'critic_hidden_dim': critic_hidden_dim,
                      'attend_heads': attend_heads,
                      'algo_config': algo_config}
def make_moa(self, hidden_dim=64, lr=0.01, rnn_policy=False, norm_in=False,
             constrain_out=False, env_obs_space=None, env_act_space=None):
    """
    Instantiate a policy, target policy, and optimizer for a model of each of
    the other agents; assumes the current agent always has position 0 in the
    env obs and act spaces.
    """
    self.moa_policies = {}
    self.moa_target_policies = {}
    self.moa_optimizers = {}
    self.moa_hidden_states = {}

    for i in range(1, len(env_act_space)):
        obs_space, act_space = env_obs_space[i], env_act_space[i]
        num_in_pol = obs_space.shape[0]
        if isinstance(act_space, Dict):
            # hard specify now, could generalize later
            num_out_pol = {
                "move": self.get_shape(act_space, "move"),
                "comm": self.get_shape(act_space, "comm")
            }
        else:
            num_out_pol = self.get_shape(act_space)

        policy_kwargs = dict(hidden_dim=hidden_dim,
                             norm_in=norm_in,
                             constrain_out=constrain_out,
                             discrete_action=self.discrete_action,
                             rnn_policy=rnn_policy)
        policy = Policy(num_in_pol, num_out_pol, **policy_kwargs)
        target_policy = Policy(num_in_pol, num_out_pol, **policy_kwargs)
        hard_update(target_policy, policy)

        # push to moa containers
        self.moa_policies[i] = policy
        self.moa_target_policies[i] = target_policy
        self.moa_optimizers[i] = Adam(policy.parameters(), lr=lr)
        self.moa_hidden_states[i] = None
def __init__(self, agent_init_params, sa_size,
             gamma=0.95, tau=0.01,
             pi_lr=0.01, q_lr=0.01,
             reward_scale=10.,
             pol_hidden_dim=128,
             critic_hidden_dim=128, attend_heads=4,
             **kwargs):
    """
    Inputs:
        agent_init_params (list of dict): List of dicts with parameters to
                                          initialize each agent
            num_in_pol (int): Input dimensions to policy
            num_out_pol (int): Output dimensions to policy
        sa_size (list of (int, int)): Size of state and action space for
                                      each agent
    """
    self.nagents = len(sa_size)

    self.agents = [AttentionAgent(lr=pi_lr, hidden_dim=pol_hidden_dim,
                                  **params)
                   for params in agent_init_params]
    self.critic = AttentionCritic(sa_size, hidden_dim=critic_hidden_dim,
                                  attend_heads=attend_heads)
    self.target_critic = AttentionCritic(sa_size, hidden_dim=critic_hidden_dim,
                                         attend_heads=attend_heads)
    hard_update(self.target_critic, self.critic)
    self.critic_optimizer = Adam(self.critic.parameters(), lr=q_lr,
                                 weight_decay=1e-3)
    self.agent_init_params = agent_init_params
    self.gamma = gamma
    self.tau = tau
    self.pi_lr = pi_lr
    self.q_lr = q_lr
    self.reward_scale = reward_scale
    self.niter = 0
def __init__(self, num_in_pol, num_out_pol, num_in_critic, hidden_dim=64,
             lr=0.01, discrete_action=True):
    """
    Inputs:
        num_in_pol (int): number of dimensions for policy input
        num_out_pol (int): number of dimensions for policy output
        num_in_critic (int): number of dimensions for critic input
    """
    self.policy = MLPNetwork(num_in_pol, num_out_pol,
                             hidden_dim=hidden_dim,
                             constrain_out=True,
                             discrete_action=discrete_action)
    self.critic = MLPNetwork(num_in_critic, 1,
                             hidden_dim=hidden_dim,
                             constrain_out=False)
    self.target_policy = MLPNetwork(num_in_pol, num_out_pol,
                                    hidden_dim=hidden_dim,
                                    constrain_out=True,
                                    discrete_action=discrete_action)
    self.target_critic = MLPNetwork(num_in_critic, 1,
                                    hidden_dim=hidden_dim,
                                    constrain_out=False)
    hard_update(self.target_policy, self.policy)
    hard_update(self.target_critic, self.critic)
    self.policy_optimizer = Adam(self.policy.parameters(), lr=lr)
    self.critic_optimizer = Adam(self.critic.parameters(), lr=lr)
    if not discrete_action:
        self.exploration = OUNoise(num_out_pol)
    else:
        self.exploration = 0.3  # epsilon for eps-greedy
    self.discrete_action = discrete_action
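# `OUNoise` above is an Ornstein-Uhlenbeck exploration-noise process for continuous
# actions. The following is a minimal sketch assuming the usual mu/theta/sigma
# parameterization; the class name and parameter names are illustrative, not the
# repository's actual API.
import numpy as np


class OUNoiseSketch:
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(action_dim)
        self.theta = theta
        self.sigma = sigma
        self.state = self.mu.copy()

    def reset(self):
        # restart the process at its mean between episodes
        self.state = self.mu.copy()

    def noise(self):
        # dx = theta * (mu - x) + sigma * N(0, 1): a mean-reverting random walk
        dx = self.theta * (self.mu - self.state) \
             + self.sigma * np.random.randn(len(self.state))
        self.state = self.state + dx
        return self.state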
def __init__(self, obs_shape, action_size, hidden_dim=64, lr=0.01,
             adam_eps=1e-8, nonlin=F.relu, n_pol_heads=1):
    self.policy = DiscretePolicy(obs_shape, action_size,
                                 hidden_dim=hidden_dim,
                                 nonlin=nonlin,
                                 n_heads=n_pol_heads)
    self.target_policy = DiscretePolicy(obs_shape, action_size,
                                        hidden_dim=hidden_dim,
                                        nonlin=nonlin,
                                        n_heads=n_pol_heads)
    hard_update(self.target_policy, self.policy)
    self.policy_optimizer = Adam(self.policy.parameters(), lr=lr, eps=adam_eps)
def __init__(self, nagents, obs_shape, state_shape, action_size,
             gamma_e=0.95, gamma_i=0.95, tau=0.01, hard_update_interval=None,
             pi_lr=0.01, q_lr=0.01, phi_lr=0.1, adam_eps=1e-8,
             q_decay=1e-3, phi_decay=1e-4,
             reward_scale=10., head_reward_scale=25.,
             pol_hidden_dim=64, critic_hidden_dim=64, nonlin=F.relu,
             n_intr_rew_types=0, sep_extr_head=False, beta=0.5,
             **kwargs):
    """
    Inputs:
        obs_shape (int): Dimensions of vector observations
        state_shape (int): Dimensions of vector global state
        gamma_e, gamma_i (float): Discount factors for extrinsic and
                                  intrinsic rewards
        tau (float): Target update rate
        pi_lr (float): Learning rate for policy
        q_lr (float): Learning rate for critic
        reward_scale (float): Scaling for reward (has effect of optimal
                              policy entropy)
        hidden_dim (int): Number of hidden dimensions for networks
    """
    self.nagents = nagents
    n_pol_heads = n_intr_rew_types + int(sep_extr_head)

    self.agents = [Agent(obs_shape, action_size, lr=pi_lr, adam_eps=adam_eps,
                         hidden_dim=pol_hidden_dim, nonlin=nonlin,
                         n_pol_heads=n_pol_heads)
                   for _ in range(nagents)]
    self.critic = CentralCritic(state_shape[0], action_size, nagents,
                                hidden_dim=critic_hidden_dim,
                                n_intr_rew_heads=n_intr_rew_types,
                                sep_extr_head=sep_extr_head)
    self.target_critic = CentralCritic(state_shape[0], action_size, nagents,
                                       hidden_dim=critic_hidden_dim,
                                       n_intr_rew_heads=n_intr_rew_types,
                                       sep_extr_head=sep_extr_head)
    hard_update(self.target_critic, self.critic)
    self.critic_optimizer = Adam(self.critic.parameters(), lr=q_lr,
                                 eps=adam_eps, weight_decay=q_decay)
    self.gamma_e = gamma_e
    self.gamma_i = gamma_i
    self.tau = tau
    self.hard_update_interval = hard_update_interval
    self.pi_lr = pi_lr
    self.q_lr = q_lr
    self.reward_scale = reward_scale
    self.head_reward_scale = head_reward_scale
    self.n_intr_rew_types = n_intr_rew_types
    self.sep_extr_head = sep_extr_head  # separate policy head only trained on extr rews
    self.n_pol_heads = n_pol_heads
    self.beta = beta
    self.head_selector = HeadSelector(nagents, n_pol_heads)
    self.head_selector_optimizer = SGD(self.head_selector.parameters(),
                                       lr=phi_lr, weight_decay=phi_decay)
    self.curr_pol_heads = self.sample_pol_heads()
    self.grad_norm_clip = 10
    self.niter = 0
def __init__(self, agent_init_params, alg_types,
             gamma=0.95, tau=0.01, lr=0.01, hidden_dim=64,
             discrete_action=False, stochastic=False, commonCritic=False,
             gasil=False, dlr=0.0003, lambda_disc=0.5, batch_size_disc=512,
             dynamic=False):
    """
    Inputs:
        agent_init_params (list of dict): List of dicts with parameters to
                                          initialize each agent
            num_in_pol (int): Input dimensions to policy
            num_out_pol (int): Output dimensions to policy
            num_in_critic (int): Input dimensions to critic
        alg_types (list of str): Learning algorithm for each agent (DDPG
                                 or MADDPG)
        gamma (float): Discount factor
        tau (float): Target update rate
        lr (float): Learning rate for policy and critic
        hidden_dim (int): Number of hidden dimensions for networks
        discrete_action (bool): Whether or not to use discrete action space
    """
    self.nagents = len(alg_types)
    self.alg_types = alg_types
    self.agents = [DDPGAgent(lr=lr, discrete_action=discrete_action,
                             hidden_dim=hidden_dim, **params)
                   for params in agent_init_params]
    # freeze target-policy parameters; targets are only refreshed via
    # hard/soft updates, never by gradient descent
    for agent in self.agents:
        for p in agent.target_policy.parameters():
            p.requires_grad = False
    self.agent_init_params = agent_init_params
    self.gamma = gamma
    self.tau = tau
    self.lr = lr
    self.dlr = dlr
    self.discrete_action = discrete_action
    self.pol_dev = 'cpu'  # device for policies
    self.critic_dev = 'cpu'  # device for critics
    self.trgt_pol_dev = 'cpu'  # device for target policies
    self.trgt_critic_dev = 'cpu'  # device for target critics
    self.disc_dev = 'cpu'
    self.niter = 0
    self.stochastic = stochastic
    self.commonCritic = commonCritic
    self.gasil = gasil
    self.lambda_disc = lambda_disc
    self.batch_size_disc = batch_size_disc
    self.dynamic = dynamic
    num_in_critic = self.agent_init_params[0]['num_in_critic']
    self.cuda = torch.cuda.is_available()
    if self.commonCritic:
        # num_in_discriminator = self.agent_init_params[0]['num_in_pol'] + self.agent_init_params[0]['num_out_pol']
        # This can be changed and looked at
        self.critic = MLPNetwork(num_in_critic, 1,
                                 hidden_dim=hidden_dim,
                                 constrain_out=False)
        self.target_critic = MLPNetwork(num_in_critic, 1,
                                        hidden_dim=hidden_dim,
                                        constrain_out=False)
        hard_update(self.target_critic, self.critic)
        self.critic_optimizer = Adam(self.critic.parameters(), lr=lr)
    if self.gasil:
        self.discriminator = MLPNetwork_Disc(num_in_critic, 1,
                                             hidden_dim=hidden_dim,
                                             norm_in=False,
                                             constrain_out=False,
                                             discrete_action=False)
        self.discriminator_optimizer = Adam(self.discriminator.parameters(),
                                            lr=dlr)
def __init__(self, algo_type="MADDPG", act_space=None, obs_space=None,
             rnn_policy=False, rnn_critic=False,
             hidden_dim=64, lr=0.01, norm_in=False, constrain_out=False,
             env_obs_space=None, env_act_space=None,
             model_of_agents=False, **kwargs):
    """
    Inputs:
        act_space: single agent action space (single space or Dict)
        obs_space: single agent observation space (single space Dict)
    """
    self.algo_type = algo_type
    self.act_space = act_space
    self.obs_space = obs_space

    # continuous or discrete action (only look at `move` action, assume
    # move and comm space both discrete or continuous)
    tmp = act_space.spaces["move"] if isinstance(act_space, Dict) else act_space
    self.discrete_action = not isinstance(tmp, Box)

    # Exploration noise
    if not self.discrete_action:
        # `move`, `comm` share same continuous noise source
        self.exploration = OUNoise(self.get_shape(act_space))
    else:
        self.exploration = 0.3  # epsilon for eps-greedy

    # Policy (supports multiple outputs)
    self.rnn_policy = rnn_policy
    self.policy_hidden_states = None

    num_in_pol = obs_space.shape[0]
    if isinstance(act_space, Dict):
        # hard specify now, could generalize later
        num_out_pol = {
            "move": self.get_shape(act_space, "move"),
            "comm": self.get_shape(act_space, "comm")
        }
    else:
        num_out_pol = self.get_shape(act_space)

    policy_kwargs = dict(hidden_dim=hidden_dim,
                         norm_in=norm_in,
                         constrain_out=constrain_out,
                         discrete_action=self.discrete_action,
                         rnn_policy=rnn_policy)
    self.policy = Policy(num_in_pol, num_out_pol, **policy_kwargs)
    self.target_policy = Policy(num_in_pol, num_out_pol, **policy_kwargs)
    hard_update(self.target_policy, self.policy)

    # action selector (distribution wrapper)
    if self.discrete_action:
        self.selector = DiscreteActionSelector()
    else:
        self.selector = ContinuousActionSelector()

    # Critic
    self.rnn_critic = rnn_critic
    self.critic_hidden_states = None

    if algo_type == "MADDPG":
        num_in_critic = 0
        for oobsp in env_obs_space:
            num_in_critic += oobsp.shape[0]
        for oacsp in env_act_space:
            # feed all acts to centralized critic
            num_in_critic += self.get_shape(oacsp)
    else:
        # only DDPG, local critic
        num_in_critic = obs_space.shape[0] + self.get_shape(act_space)

    critic_net_fn = RecurrentNetwork if rnn_critic else MLPNetwork
    critic_kwargs = dict(hidden_dim=hidden_dim,
                         norm_in=norm_in,
                         constrain_out=constrain_out)
    self.critic = critic_net_fn(num_in_critic, 1, **critic_kwargs)
    self.target_critic = critic_net_fn(num_in_critic, 1, **critic_kwargs)
    hard_update(self.target_critic, self.critic)

    # Optimizers
    self.policy_optimizer = Adam(self.policy.parameters(), lr=lr)
    self.critic_optimizer = Adam(self.critic.parameters(), lr=lr)

    # model of agents (approximate models for other agents)
    self.model_of_agents = model_of_agents
    if model_of_agents:
        self.make_moa(hidden_dim=hidden_dim, lr=lr, rnn_policy=rnn_policy,
                      norm_in=norm_in, constrain_out=constrain_out,
                      env_obs_space=env_obs_space,
                      env_act_space=env_act_space)
def __init__(self, algo_type="MADDPG", act_space=None, obs_space=None,
             rnn_policy=False, rnn_critic=False,
             hidden_dim=64, lr=0.01, norm_in=False, constrain_out=False,
             thought_dim=64,
             env_obs_space=None, env_act_space=None, **kwargs):
    """
    Inputs:
        act_space: single agent action space (single space or Dict)
        obs_space: single agent observation space (single space Dict)
    """
    self.algo_type = algo_type
    self.act_space = act_space
    self.obs_space = obs_space

    # continuous or discrete action (only look at `move` action, assume
    # move and comm space both discrete or continuous)
    tmp = act_space.spaces["move"] if isinstance(act_space, Dict) else act_space
    self.discrete_action = not isinstance(tmp, Box)

    # Exploration noise
    if not self.discrete_action:
        # `move`, `comm` share same continuous noise source
        self.exploration = OUNoise(self.get_shape(act_space))
    else:
        self.exploration = 0.3  # epsilon for eps-greedy

    # Policy (supports multiple outputs)
    self.rnn_policy = rnn_policy
    self.policy_hidden_states = None

    num_in_pol = obs_space.shape[0]
    if isinstance(act_space, Dict):
        # hard specify now, could generalize later
        num_out_pol = {
            "move": self.get_shape(act_space, "move"),
            "comm": self.get_shape(act_space, "comm")
        }
    else:
        num_out_pol = self.get_shape(act_space)

    # atoc policy
    policy_kwargs = dict(
        hidden_dim=hidden_dim,
        norm_in=norm_in,
        constrain_out=constrain_out,
        discrete_action=self.discrete_action,
        rnn_policy=rnn_policy,
        thought_dim=thought_dim
    )
    self.policy = ATOCPolicy(num_in_pol, num_out_pol, **policy_kwargs)
    self.target_policy = ATOCPolicy(num_in_pol, num_out_pol, **policy_kwargs)
    hard_update(self.target_policy, self.policy)

    # Critic
    self.rnn_critic = rnn_critic
    self.critic_hidden_states = None

    if algo_type == "MADDPG":
        num_in_critic = 0
        for oobsp in env_obs_space:
            num_in_critic += oobsp.shape[0]
        for oacsp in env_act_space:
            # feed all acts to centralized critic
            num_in_critic += self.get_shape(oacsp)
    else:
        # only DDPG, local critic
        num_in_critic = obs_space.shape[0] + self.get_shape(act_space)

    critic_net_fn = RecurrentNetwork if rnn_critic else MLPNetwork
    critic_kwargs = dict(
        hidden_dim=hidden_dim,
        norm_in=norm_in,
        constrain_out=constrain_out
    )
    self.critic = critic_net_fn(num_in_critic, 1, **critic_kwargs)
    self.target_critic = critic_net_fn(num_in_critic, 1, **critic_kwargs)
    hard_update(self.target_critic, self.critic)

    # NOTE: atoc modules
    # attention unit, MLP (used here) or RNN, output comm probability
    self.thought_dim = thought_dim
    self.attention_unit = nn.Sequential(
        MLPNetwork(thought_dim, 1, hidden_dim=hidden_dim,
                   norm_in=norm_in, constrain_out=False),
        nn.Sigmoid()
    )
    # communication channel, bi-LSTM (used here) or graph
    self.comm_channel = nn.LSTM(thought_dim, thought_dim, 1,
                                batch_first=False, bidirectional=True)

    # Optimizers
    self.policy_optimizer = Adam(self.policy.parameters(), lr=lr)
    self.critic_optimizer = Adam(self.critic.parameters(), lr=lr)
    self.attention_unit_optimizer = Adam(self.attention_unit.parameters(), lr=lr)
    self.comm_channel_optimizer = Adam(self.comm_channel.parameters(), lr=lr)
def __init__(self, algo_type="MADDPG", act_space=None, obs_space=None,
             rnn_policy=False, rnn_critic=False,
             hidden_dim=64, lr=0.01,
             env_obs_space=None, env_act_space=None):
    """
    Inputs:
        act_space: single agent action space (single space or Dict)
        obs_space: single agent observation space (single space Dict)
    """
    self.algo_type = algo_type
    self.act_space = act_space
    self.obs_space = obs_space

    # continuous or discrete action (only look at `move` action, assume
    # move and comm space both discrete or continuous)
    tmp = act_space["move"] if isinstance(act_space, Dict) else act_space
    if isinstance(tmp, Box):
        discrete_action = False
    elif isinstance(tmp, Discrete):
        discrete_action = True
    self.discrete_action = discrete_action

    # Exploration noise
    if not discrete_action:
        # `move`, `comm` share same continuous noise source
        self.exploration = OUNoise(self.get_shape(act_space))
    else:
        self.exploration = 0.3  # epsilon for eps-greedy

    # Policy (supports multiple outputs)
    self.rnn_policy = rnn_policy
    self.policy_hidden_states = None

    num_in_pol = obs_space.shape[0]
    if isinstance(act_space, Dict):
        # hard specify now, could generalize later
        num_out_pol = {
            "move": self.get_shape(act_space, "move"),
            "comm": self.get_shape(act_space, "comm")
        }
    else:
        num_out_pol = self.get_shape(act_space)

    self.policy = Policy(num_in_pol, num_out_pol,
                         hidden_dim=hidden_dim,
                         constrain_out=True,
                         discrete_action=discrete_action,
                         rnn_policy=rnn_policy)
    self.target_policy = Policy(num_in_pol, num_out_pol,
                                hidden_dim=hidden_dim,
                                constrain_out=True,
                                discrete_action=discrete_action,
                                rnn_policy=rnn_policy)
    hard_update(self.target_policy, self.policy)

    # Critic
    self.rnn_critic = rnn_critic
    self.critic_hidden_states = None

    if algo_type == "MADDPG":
        num_in_critic = 0
        for oobsp in env_obs_space:
            num_in_critic += oobsp.shape[0]
        for oacsp in env_act_space:
            # feed all acts to centralized critic
            num_in_critic += self.get_shape(oacsp)
    else:
        # only DDPG, local critic
        num_in_critic = obs_space.shape[0] + self.get_shape(act_space)

    critic_net_fn = RecurrentNetwork if rnn_critic else MLPNetwork
    self.critic = critic_net_fn(num_in_critic, 1,
                                hidden_dim=hidden_dim,
                                constrain_out=False)
    self.target_critic = critic_net_fn(num_in_critic, 1,
                                       hidden_dim=hidden_dim,
                                       constrain_out=False)
    hard_update(self.target_critic, self.critic)

    # Optimizers
    self.policy_optimizer = Adam(self.policy.parameters(), lr=lr)
    self.critic_optimizer = Adam(self.critic.parameters(), lr=lr)
def __init__(self, agent_init_params, sa_size,
             gamma=0.95, tau=0.01,
             pi_lr=0.01, q_lr=0.01,
             reward_scale=10.,
             pol_hidden_dim=128,
             critic_hidden_dim=128, attend_heads=4,
             l1_reg=0.01,
             **kwargs):
    """
    Inputs:
        agent_init_params (list of dict): List of dicts with parameters to
                                          initialize each agent
            num_in_pol (int): Input dimensions to policy
            num_out_pol (int): Output dimensions to policy
        sa_size (list of (int, int)): Size of state and action space for
                                      each agent
        gamma (float): Discount factor
        tau (float): Target update rate
        pi_lr (float): Learning rate for policy
        q_lr (float): Learning rate for critic
        reward_scale (float): Scaling for reward (has effect of optimal
                              policy entropy)
        hidden_dim (int): Number of hidden dimensions for networks
    """
    self.nagents = len(sa_size)
    self.l1_reg = l1_reg

    self.agents = [AttentionAgent(lr=pi_lr, hidden_dim=pol_hidden_dim,
                                  **params)
                   for params in agent_init_params]

    widths = []
    hidden_layers = []
    running_width = 0
    for sdim, adim in sa_size:
        totdim = sdim + adim
        running_width += totdim
        widths.append(running_width)
        hidden_layers.append(2)

    # def __init__(self, sa_sizes, widths, hidden_layers, selector_width,
    #              selector_depth, hidden_dim=32, **kwargs):
    self.critic = SelectiveAttentionCritic(sa_size,
                                           widths=widths,
                                           hidden_layers=hidden_layers,
                                           selector_width=2,
                                           selector_depth=2)
    self.target_critic = SelectiveAttentionCritic(sa_size,
                                                  widths=widths,
                                                  hidden_layers=hidden_layers,
                                                  selector_width=2,
                                                  selector_depth=2)
    hard_update(self.target_critic, self.critic)
    self.critic_optimizer = Adam(self.critic.parameters(), lr=q_lr,
                                 weight_decay=1e-3)
    self.agent_init_params = agent_init_params
    self.gamma = gamma
    self.tau = tau
    self.pi_lr = pi_lr
    self.q_lr = q_lr
    self.reward_scale = reward_scale
    self.pol_dev = 'cpu'  # device for policies
    self.critic_dev = 'cpu'  # device for critics
    self.trgt_pol_dev = 'cpu'  # device for target policies
    self.trgt_critic_dev = 'cpu'  # device for target critics
    self.niter = 0
def __init__(self, algo_type="MASAC", act_space=None, obs_space=None,
             rnn_policy=False, rnn_critic=False,
             hidden_dim=64, lr=0.01,
             env_obs_space=None, env_act_space=None, **kwargs):
    """
    Inputs:
        act_space: single agent action space (single space or Dict)
        obs_space: single agent observation space (single space Dict)
    """
    self.algo_type = algo_type
    self.act_space = act_space
    self.obs_space = obs_space

    # continuous or discrete action (only look at `move` action, assume
    # move and comm space both discrete or continuous)
    tmp = act_space.spaces["move"] if isinstance(act_space, Dict) else act_space
    self.discrete_action = not isinstance(tmp, Box)

    # Policy (supports multiple outputs)
    self.rnn_policy = rnn_policy
    self.policy_hidden_states = None

    num_in_pol = obs_space.shape[0]
    if isinstance(act_space, Dict):
        # hard specify now, could generalize later
        num_out_pol = {
            "move": self.get_shape(act_space, "move"),
            "comm": self.get_shape(act_space, "comm")
        }
    else:
        num_out_pol = self.get_shape(act_space)

    self.policy = Policy(num_in_pol, num_out_pol,
                         hidden_dim=hidden_dim,
                         # constrain_out=True,
                         discrete_action=self.discrete_action,
                         rnn_policy=rnn_policy)

    # action selector (distribution wrapper)
    if self.discrete_action:
        self.selector = DiscreteActionSelector()
    else:
        self.selector = ContinuousActionSelector()

    # Critic
    self.rnn_critic = rnn_critic
    self.critic_hidden_states = None

    if algo_type == "MASAC":
        num_in_critic = 0
        for oobsp in env_obs_space:
            num_in_critic += oobsp.shape[0]
        for oacsp in env_act_space:
            # feed all acts to centralized critic
            num_in_critic += self.get_shape(oacsp)
    else:
        # only DDPG, local critic
        num_in_critic = obs_space.shape[0] + self.get_shape(act_space)

    critic_net_fn = RecurrentNetwork if rnn_critic else MLPNetwork
    self.critic1 = critic_net_fn(num_in_critic, 1,
                                 hidden_dim=hidden_dim, constrain_out=False)
    self.critic2 = critic_net_fn(num_in_critic, 1,
                                 hidden_dim=hidden_dim, constrain_out=False)
    self.target_critic1 = critic_net_fn(num_in_critic, 1,
                                        hidden_dim=hidden_dim,
                                        constrain_out=False)
    self.target_critic2 = critic_net_fn(num_in_critic, 1,
                                        hidden_dim=hidden_dim,
                                        constrain_out=False)
    hard_update(self.target_critic1, self.critic1)
    hard_update(self.target_critic2, self.critic2)

    # alpha
    self.log_alpha = LogAlpha(0.0)

    # Optimizers
    self.policy_optimizer = Adam(self.policy.parameters(), lr=lr)
    self.critic1_optimizer = Adam(self.critic1.parameters(), lr=lr)
    self.critic2_optimizer = Adam(self.critic2.parameters(), lr=lr)
    self.alpha_optimizer = Adam((self.log_alpha,), lr=lr)
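# `LogAlpha(0.0)` above is the learnable SAC entropy temperature, kept in log space
# so that alpha = exp(log_alpha) stays positive. Its definition is not shown in these
# snippets; the following is a minimal stand-in that is compatible with
# `Adam((self.log_alpha,), lr=lr)`. The helper name is illustrative, not the
# repository's actual class.
import torch


def make_log_alpha(init_value=0.0):
    # a single trainable scalar; alpha is recovered with log_alpha.exp()
    return torch.nn.Parameter(torch.tensor(float(init_value)))


# Illustrative temperature objective (names `log_prob` and `target_entropy` are
# assumptions), pushing policy entropy toward a target:
# alpha_loss = -(log_alpha * (log_prob + target_entropy).detach()).mean()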