class QtranAlt: def __init__(self, args): self.n_actions = args.n_actions self.n_agents = args.n_agents self.state_shape = args.state_shape self.obs_shape = args.obs_shape rnn_input_shape = self.obs_shape # 根据参数决定RNN的输入维度 if args.last_action: rnn_input_shape += self.n_actions # 当前agent的上一个动作的one_hot向量 if args.reuse_network: rnn_input_shape += self.n_agents self.args = args # 神经网络 self.eval_rnn = RNN(rnn_input_shape, args) # individual networks self.target_rnn = RNN(rnn_input_shape, args) self.eval_joint_q = QtranQAlt(args) # counterfactual joint networks self.target_joint_q = QtranQAlt(args) self.v = QtranV(args) if self.args.cuda: self.eval_rnn.cuda() self.target_rnn.cuda() self.eval_joint_q.cuda() self.target_joint_q.cuda() self.v.cuda() self.model_dir = args.model_dir + '/' + args.alg + '/' + args.map # 如果存在模型则加载模型 if self.args.load_model: if os.path.exists(self.model_dir + '/rnn_net_params.pkl'): path_rnn = self.model_dir + '/rnn_net_params.pkl' path_joint_q = self.model_dir + '/joint_q_params.pkl' path_v = self.model_dir + '/v_params.pkl' map_location = 'cuda:0' if self.args.cuda else 'cpu' self.eval_rnn.load_state_dict( torch.load(path_rnn, map_location=map_location)) self.eval_joint_q.load_state_dict( torch.load(path_joint_q, map_location=map_location)) self.v.load_state_dict( torch.load(path_v, map_location=map_location)) print('Successfully load the model: {}, {} and {}'.format( path_rnn, path_joint_q, path_v)) else: raise Exception("No model!") # 让target_net和eval_net的网络参数相同 self.target_rnn.load_state_dict(self.eval_rnn.state_dict()) self.target_joint_q.load_state_dict(self.eval_joint_q.state_dict()) self.eval_parameters = list(self.eval_joint_q.parameters()) + \ list(self.v.parameters()) + \ list(self.eval_rnn.parameters()) if args.optimizer == "RMS": self.optimizer = torch.optim.RMSprop(self.eval_parameters, lr=args.lr) # 执行过程中,要为每个agent都维护一个eval_hidden # 学习过程中,要为每个episode的每个agent都维护一个eval_hidden、target_hidden self.eval_hidden = None self.target_hidden = None print('Init alg QTRAN-alt') def learn(self, batch, max_episode_len, train_step, epsilon=None): # train_step表示是第几次学习,用来控制更新target_net网络的参数 ''' 在learn的时候,抽取到的数据是四维的,四个维度分别为 1——第几个episode 2——episode中第几个transition 3——第几个agent的数据 4——具体obs维度。因为在选动作时不仅需要输入当前的inputs,还要给神经网络输入hidden_state, hidden_state和之前的经验相关,因此就不能随机抽取经验进行学习。所以这里一次抽取多个episode,然后一次给神经网络 传入每个episode的同一个位置的transition ''' episode_num = batch['o'].shape[0] self.init_hidden(episode_num) for key in batch.keys(): # 把batch里的数据转化成tensor if key == 'u': batch[key] = torch.tensor(batch[key], dtype=torch.long) else: batch[key] = torch.tensor(batch[key], dtype=torch.float32) s, s_next, u, r, avail_u, avail_u_next, terminated = batch['s'], batch['s_next'], batch['u'], \ batch['r'], batch['avail_u'], batch['avail_u_next'],\ batch['terminated'] mask = 1 - batch["padded"].float().repeat( 1, 1, self.n_agents) # 用来把那些填充的经验的TD-error置0,从而不让它们影响到学习 if self.args.cuda: u = u.cuda() r = r.cuda() avail_u = avail_u.cuda() avail_u_next = avail_u_next.cuda() terminated = terminated.cuda() mask = mask.cuda() # 得到每个agent对应的Q和hidden_states,维度为(episode个数, max_episode_len, n_agents, n_actions/hidden_dim) individual_q_evals, individual_q_targets, hidden_evals, hidden_targets = self._get_individual_q( batch, max_episode_len) # 得到当前时刻和下一时刻每个agent的局部最优动作及其one_hot表示 individual_q_clone = individual_q_evals.clone() individual_q_clone[avail_u == 0.0] = -999999 individual_q_targets[avail_u_next == 0.0] = -999999 opt_onehot_eval = torch.zeros(*individual_q_clone.shape) opt_action_eval = 
individual_q_clone.argmax(dim=3, keepdim=True) opt_onehot_eval = opt_onehot_eval.scatter(-1, opt_action_eval[:, :].cpu(), 1) opt_onehot_target = torch.zeros(*individual_q_targets.shape) opt_action_target = individual_q_targets.argmax(dim=3, keepdim=True) opt_onehot_target = opt_onehot_target.scatter( -1, opt_action_target[:, :].cpu(), 1) # ---------------------------------------------L_td------------------------------------------------------------- # 计算joint_q和v,要注意joint_q是每个agent都有,v只有一个 # joint_q的维度为(episode个数, max_episode_len, n_agents, n_actions), 而且joint_q在后面的l_nopt还要用到 # v的维度为(episode个数, max_episode_len) joint_q_evals, joint_q_targets, v = self.get_qtran( batch, opt_onehot_target, hidden_evals, hidden_targets) # 取出当前agent动作对应的joint_q_chosen以及它的局部最优动作对应的joint_q joint_q_chosen = torch.gather(joint_q_evals, dim=-1, index=u).squeeze( -1) # (episode个数, max_episode_len, n_agents) joint_q_opt = torch.gather(joint_q_targets, dim=-1, index=opt_action_target).squeeze(-1) # loss y_dqn = r.repeat(1, 1, self.n_agents) + self.args.gamma * joint_q_opt * ( 1 - terminated.repeat(1, 1, self.n_agents)) td_error = joint_q_chosen - y_dqn.detach() l_td = ((td_error * mask)**2).sum() / mask.sum() # ---------------------------------------------L_td------------------------------------------------------------- # ---------------------------------------------L_opt------------------------------------------------------------ # 将局部最优动作的Q值相加 (episode个数,max_episode_len) # 这里要使用individual_q_clone,它把不能执行的动作Q值改变了,使用individual_q_evals可能会使用不能执行的动作的Q值 q_sum_opt = individual_q_clone.max(dim=-1)[0].sum(dim=-1) # 重新得到joint_q_opt_eval,它和joint_q_evals的区别是前者输入的动作是当前局部最优动作,后者输入的动作是当前执行的动作 joint_q_opt_evals, _, _ = self.get_qtran(batch, opt_onehot_eval, hidden_evals, hidden_targets, hat=True) joint_q_opt_evals = torch.gather( joint_q_opt_evals, dim=-1, index=opt_action_eval).squeeze( -1) # (episode个数, max_episode_len, n_agents) # 因为QTRAN-alt要对每个agent都计算l_opt,所以要把q_sum_opt和v再增加一个agent维 q_sum_opt = q_sum_opt.unsqueeze(-1).expand(-1, -1, self.n_agents) v = v.unsqueeze(-1).expand(-1, -1, self.n_agents) opt_error = q_sum_opt - joint_q_opt_evals.detach( ) + v # 计算l_opt时需要将joint_q_opt_evals固定 l_opt = ((opt_error * mask)**2).sum() / mask.sum() # ---------------------------------------------L_opt------------------------------------------------------------ # ---------------------------------------------L_nopt----------------------------------------------------------- # 因为L_nopt约束的是当前agent所有可执行的动作中,对应的最小的d,为了让不能执行的动作不影响d的计算,将不能执行的动作对应的q变大 individual_q_evals[avail_u == 0.0] = 999999 # 得到agent_i之外的其他agent的执行动作的Q值之和q_other_sum # 1. 先得到每个agent的执行动作的Q值q_all,(episode个数, max_episode_len, n_agents, 1) q_all_chosen = torch.gather(individual_q_evals, dim=-1, index=u) # 2. 把q_all最后一个维度上当前agent的Q值变成所有agent的Q值,(episode个数, max_episode_len, n_agents, n_agents) q_all_chosen = q_all_chosen.view((episode_num, max_episode_len, 1, -1)).repeat(1, 1, self.n_agents, 1) q_mask = (1 - torch.eye(self.n_agents)).unsqueeze(0).unsqueeze(0) if self.args.cuda: q_mask = q_mask.cuda() q_other_chosen = q_all_chosen * q_mask # 把每个agent自己的Q值置为0,从而才能相加得到其他agent的Q值之和 # 3. 
求和,同时由于对于当前agent的每个动作,都要和q_other_sum相加,所以把q_other_sum扩展出n_actions维度 q_other_sum = q_other_chosen.sum(dim=-1, keepdim=True).repeat( 1, 1, 1, self.n_actions) # 当前agent的每个动作的Q和其他agent执行动作的Q相加,得到D中的第一项 q_sum_nopt = individual_q_evals + q_other_sum # 因为joint_q_evals的维度是(episode个数,max_episode_len,n_agents,n_actions),所以要对v扩展出一个n_actions维度 v = v.unsqueeze(-1).expand(-1, -1, -1, self.n_actions) d = q_sum_nopt - joint_q_evals.detach( ) + v # 计算l_nopt时需要将qtran_q_evals固定 d = d.min(dim=-1)[0] l_nopt = ((d * mask)**2).sum() / mask.sum() # ---------------------------------------------L_nopt----------------------------------------------------------- # print('l_td is {}, l_opt is {}, l_nopt is {}'.format(l_td, l_opt, l_nopt)) loss = l_td + self.args.lambda_opt * l_opt + self.args.lambda_nopt * l_nopt # loss = l_td + self.args.lambda_opt * l_opt self.optimizer.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm_(self.eval_parameters, self.args.grad_norm_clip) self.optimizer.step() if train_step > 0 and train_step % self.args.target_update_cycle == 0: self.target_rnn.load_state_dict(self.eval_rnn.state_dict()) self.target_joint_q.load_state_dict(self.eval_joint_q.state_dict()) def _get_individual_q(self, batch, max_episode_len): episode_num = batch['o'].shape[0] q_evals, q_targets, hidden_evals, hidden_targets = [], [], [], [] for transition_idx in range(max_episode_len): inputs, inputs_next = self._get_individual_inputs( batch, transition_idx) # 给obs加last_action、agent_id if self.args.cuda: inputs = inputs.cuda() self.eval_hidden = self.eval_hidden.cuda() inputs_next = inputs_next.cuda() self.target_hidden = self.target_hidden.cuda() q_eval, self.eval_hidden = self.eval_rnn(inputs, self.eval_hidden) q_target, self.target_hidden = self.target_rnn( inputs_next, self.target_hidden) hidden_eval, hidden_target = self.eval_hidden.clone( ), self.target_hidden.clone() # 把q_eval维度重新变回(8, 5,n_actions) q_eval = q_eval.view(episode_num, self.n_agents, -1) q_target = q_target.view(episode_num, self.n_agents, -1) hidden_eval = hidden_eval.view(episode_num, self.n_agents, -1) hidden_target = hidden_target.view(episode_num, self.n_agents, -1) q_evals.append(q_eval) q_targets.append(q_target) hidden_evals.append(hidden_eval) hidden_targets.append(hidden_target) # 得的q_eval和q_target是一个列表,列表里装着max_episode_len个数组,数组的的维度是(episode个数, n_agents,n_actions) # 把该列表转化成(episode个数, max_episode_len, n_agents,n_actions)的数组 q_evals = torch.stack(q_evals, dim=1) q_targets = torch.stack(q_targets, dim=1) hidden_evals = torch.stack(hidden_evals, dim=1) hidden_targets = torch.stack(hidden_targets, dim=1) return q_evals, q_targets, hidden_evals, hidden_targets def _get_individual_inputs(self, batch, transition_idx): # 取出所有episode上该transition_idx的经验,u_onehot要取出所有,因为要用到上一条 obs, obs_next, u_onehot = batch['o'][:, transition_idx], \ batch['o_next'][:, transition_idx], batch['u_onehot'][:] episode_num = obs.shape[0] inputs, inputs_next = [], [] inputs.append(obs) inputs_next.append(obs_next) # 给obs添加上一个动作、agent编号 if self.args.last_action: if transition_idx == 0: # 如果是第一条经验,就让前一个动作为0向量 inputs.append(torch.zeros_like(u_onehot[:, transition_idx])) else: inputs.append(u_onehot[:, transition_idx - 1]) inputs_next.append(u_onehot[:, transition_idx]) if self.args.reuse_network: # 因为当前的obs三维的数据,每一维分别代表(episode编号,agent编号,obs维度),直接在dim_1上添加对应的向量 # 即可,比如给agent_0后面加(1, 0, 0, 0, 0),表示5个agent中的0号。而agent_0的数据正好在第0行,那么需要加的 # agent编号恰好就是一个单位矩阵,即对角线为1,其余为0 inputs.append( torch.eye(self.args.n_agents).unsqueeze(0).expand( episode_num, -1, -1)) inputs_next.append( 
torch.eye(self.args.n_agents).unsqueeze(0).expand( episode_num, -1, -1)) # 要把obs中的三个拼起来,并且要把episode_num个episode、self.args.n_agents个agent的数据拼成40条(40,96)的数据, # 因为这里所有agent共享一个神经网络,每条数据中带上了自己的编号,所以还是自己的数据 inputs = torch.cat( [x.reshape(episode_num * self.args.n_agents, -1) for x in inputs], dim=1) inputs_next = torch.cat([ x.reshape(episode_num * self.args.n_agents, -1) for x in inputs_next ], dim=1) return inputs, inputs_next def get_qtran(self, batch, local_opt_actions, hidden_evals, hidden_targets=None, hat=False): episode_num, max_episode_len, _, _ = hidden_evals.shape s = batch['s'][:, :max_episode_len] s_next = batch['s_next'][:, :max_episode_len] u_onehot = batch['u_onehot'][:, :max_episode_len] v_state = s.clone() # s和s_next没有n_agents维度,每个agent的joint_q网络都需要, 所以要把s转化成四维 s = s.unsqueeze(-2).expand(-1, -1, self.n_agents, -1) s_next = s_next.unsqueeze(-2).expand(-1, -1, self.n_agents, -1) # 添加agent编号对应的one-hot向量 ''' 因为当前的inputs三维的数据,每一维分别代表(episode编号,agent编号,inputs维度),直接在后面添加对应的向量 即可,比如给agent_0后面加(1, 0, 0, 0, 0),表示5个agent中的0号。而agent_0的数据正好在第0行,那么需要加的 agent编号恰好就是一个单位矩阵,即对角线为1,其余为0 ''' action_onehot = torch.eye( self.n_agents).unsqueeze(0).unsqueeze(0).expand( episode_num, max_episode_len, -1, -1) s_eval = torch.cat([s, action_onehot], dim=-1) s_target = torch.cat([s_next, action_onehot], dim=-1) if self.args.cuda: s_eval = s_eval.cuda() s_target = s_target.cuda() v_state = v_state.cuda() u_onehot = u_onehot.cuda() hidden_evals = hidden_evals.cuda() hidden_targets = hidden_targets.cuda() local_opt_actions = local_opt_actions.cuda() if hat: # 神经网络输出的q_eval、q_target的维度为(episode_num * max_episode_len * n_agents, n_actions) q_evals = self.eval_joint_q(s_eval, hidden_evals, local_opt_actions) q_targets = None v = None # 把q_eval维度变回(episode_num, max_episode_len, n_agents, n_actions) q_evals = q_evals.view(episode_num, max_episode_len, -1, self.n_actions) else: q_evals = self.eval_joint_q(s_eval, hidden_evals, u_onehot) q_targets = self.target_joint_q(s_target, hidden_targets, local_opt_actions) v = self.v(v_state, hidden_evals) # 把q_eval、q_target维度变回(episode_num, max_episode_len, n_agents, n_actions) q_evals = q_evals.view(episode_num, max_episode_len, -1, self.n_actions) q_targets = q_targets.view(episode_num, max_episode_len, -1, self.n_actions) # 把v维度变回(episode_num, max_episode_len) v = v.view(episode_num, -1) return q_evals, q_targets, v def init_hidden(self, episode_num): # 为每个episode中的每个agent都初始化一个eval_hidden、target_hidden self.eval_hidden = torch.zeros( (episode_num, self.n_agents, self.args.rnn_hidden_dim)) self.target_hidden = torch.zeros( (episode_num, self.n_agents, self.args.rnn_hidden_dim)) def save_model(self, train_step): num = str(train_step // self.args.save_cycle) if not os.path.exists(self.model_dir): os.makedirs(self.model_dir) torch.save(self.eval_rnn.state_dict(), self.model_dir + '/' + num + '_rnn_net_params.pkl') torch.save(self.eval_joint_q.state_dict(), self.model_dir + '/' + num + '_joint_q_params.pkl') torch.save(self.v.state_dict(), self.model_dir + '/' + num + '_v_params.pkl')
class VDN: def __init__(self, args): self.n_actions = args.n_actions self.n_agents = args.n_agents self.state_shape = args.state_shape self.obs_shape = args.obs_shape input_shape = self.obs_shape # 根据参数决定RNN的输入维度 if args.last_action: input_shape += self.n_actions if args.reuse_network: input_shape += self.n_agents # 神经网络 self.eval_rnn = RNN(input_shape, args) # 每个agent选动作的网络 self.target_rnn = RNN(input_shape, args) self.eval_vdn_net = VDNNet() # 把agentsQ值加起来的网络 self.target_vdn_net = VDNNet() self.args = args if self.args.cuda: self.eval_rnn.cuda() self.target_rnn.cuda() self.eval_vdn_net.cuda() self.target_vdn_net.cuda() self.model_dir = args.model_dir + '/' + args.alg + '/' + args.map # 如果存在模型则加载模型 if self.args.load_model: if os.path.exists(self.model_dir + '/rnn_net_params.pkl'): path_rnn = self.model_dir + '/rnn_net_params.pkl' path_vdn = self.model_dir + '/vdn_net_params.pkl' map_location = 'cuda:0' if self.args.cuda else 'cpu' self.eval_rnn.load_state_dict( torch.load(path_rnn, map_location=map_location)) self.eval_vdn_net.load_state_dict( torch.load(path_vdn, map_location=map_location)) print('Successfully load the model: {} and {}'.format( path_rnn, path_vdn)) else: raise Exception("No model!") # 让target_net和eval_net的网络参数相同 self.target_rnn.load_state_dict(self.eval_rnn.state_dict()) self.target_vdn_net.load_state_dict(self.eval_vdn_net.state_dict()) self.eval_parameters = list(self.eval_vdn_net.parameters()) + list( self.eval_rnn.parameters()) if args.optimizer == "RMS": self.optimizer = torch.optim.RMSprop(self.eval_parameters, lr=args.lr) # 执行过程中,要为每个agent都维护一个eval_hidden # 学习过程中,要为每个episode的每个agent都维护一个eval_hidden、target_hidden self.eval_hidden = None self.target_hidden = None print('Init alg VDN') def learn(self, batch, max_episode_len, train_step, epsilon=None): # train_step表示是第几次学习,用来控制更新target_net网络的参数 ''' 在learn的时候,抽取到的数据是四维的,四个维度分别为 1——第几个episode 2——episode中第几个transition 3——第几个agent的数据 4——具体obs维度。因为在选动作时不仅需要输入当前的inputs,还要给神经网络输入hidden_state, hidden_state和之前的经验相关,因此就不能随机抽取经验进行学习。所以这里一次抽取多个episode,然后一次给神经网络 传入每个episode的同一个位置的transition ''' episode_num = batch['o'].shape[0] self.init_hidden(episode_num) for key in batch.keys(): # 把batch里的数据转化成tensor if key == 'u': batch[key] = torch.tensor(batch[key], dtype=torch.long) else: batch[key] = torch.tensor(batch[key], dtype=torch.float32) # TODO pymarl中取得经验没有取最后一条,找出原因 u, r, avail_u, avail_u_next, terminated = batch['u'], batch['r'], batch['avail_u'], \ batch['avail_u_next'], batch['terminated'] mask = 1 - batch["padded"].float() # 用来把那些填充的经验的TD-error置0,从而不让它们影响到学习 if self.args.cuda: u = u.cuda() r = r.cuda() mask = mask.cuda() terminated = terminated.cuda() # 得到每个agent对应的Q值,维度为(episode个数, max_episode_len, n_agents,n_actions) q_evals, q_targets = self.get_q_values(batch, max_episode_len) # 取每个agent动作对应的Q值,并且把最后不需要的一维去掉,因为最后一维只有一个值了 q_evals = torch.gather(q_evals, dim=3, index=u).squeeze(3) # 得到target_q q_targets[avail_u_next == 0.0] = -9999999 q_targets = q_targets.max(dim=3)[0] q_total_eval = self.eval_vdn_net(q_evals) q_total_target = self.target_vdn_net(q_targets) targets = r + self.args.gamma * q_total_target * (1 - terminated) td_error = targets.detach() - q_total_eval masked_td_error = mask * td_error # 抹掉填充的经验的td_error # loss = masked_td_error.pow(2).mean() # 不能直接用mean,因为还有许多经验是没用的,所以要求和再比真实的经验数,才是真正的均值 loss = (masked_td_error**2).sum() / mask.sum() # print('Loss is ', loss) self.optimizer.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm_(self.eval_parameters, self.args.grad_norm_clip) self.optimizer.step() if 
train_step > 0 and train_step % self.args.target_update_cycle == 0: self.target_rnn.load_state_dict(self.eval_rnn.state_dict()) self.target_vdn_net.load_state_dict(self.eval_vdn_net.state_dict()) def _get_inputs(self, batch, transition_idx): # 取出所有episode上该transition_idx的经验,u_onehot要取出所有,因为要用到上一条 obs, obs_next, u_onehot = batch['o'][:, transition_idx], \ batch['o_next'][:, transition_idx], batch['u_onehot'][:] episode_num = obs.shape[0] inputs, inputs_next = [], [] inputs.append(obs) inputs_next.append(obs_next) # 给obs添加上一个动作、agent编号 if self.args.last_action: if transition_idx == 0: # 如果是第一条经验,就让前一个动作为0向量 inputs.append(torch.zeros_like(u_onehot[:, transition_idx])) else: inputs.append(u_onehot[:, transition_idx - 1]) inputs_next.append(u_onehot[:, transition_idx]) if self.args.reuse_network: # 因为当前的obs三维的数据,每一维分别代表(episode,agent,obs维度),直接在dim_1上添加对应的向量 # 即可,比如给agent_0后面加(1, 0, 0, 0, 0),表示5个agent中的0号。而agent_0的数据正好在第0行,那么需要加的 # agent编号恰好就是一个单位矩阵,即对角线为1,其余为0 inputs.append( torch.eye(self.args.n_agents).unsqueeze(0).expand( episode_num, -1, -1)) inputs_next.append( torch.eye(self.args.n_agents).unsqueeze(0).expand( episode_num, -1, -1)) # 要把obs中的三个拼起来,并且要把episode_num个episode、self.args.n_agents个agent的数据拼成episode_num*n_agents条数据 # 因为这里所有agent共享一个神经网络,每条数据中带上了自己的编号,所以还是自己的数据 inputs = torch.cat( [x.reshape(episode_num * self.args.n_agents, -1) for x in inputs], dim=1) inputs_next = torch.cat([ x.reshape(episode_num * self.args.n_agents, -1) for x in inputs_next ], dim=1) return inputs, inputs_next def get_q_values(self, batch, max_episode_len): episode_num = batch['o'].shape[0] q_evals, q_targets = [], [] for transition_idx in range(max_episode_len): inputs, inputs_next = self._get_inputs( batch, transition_idx) # 给obs加last_action、agent_id if self.args.cuda: inputs = inputs.cuda() inputs_next = inputs_next.cuda() self.eval_hidden = self.eval_hidden.cuda() self.target_hidden = self.target_hidden.cuda() q_eval, self.eval_hidden = self.eval_rnn( inputs, self.eval_hidden ) # 得到的q_eval维度为(episode_num*n_agents, n_actions) q_target, self.target_hidden = self.target_rnn( inputs_next, self.target_hidden) # 把q_eval维度重新变回(episode_num, n_agents, n_actions) q_eval = q_eval.view(episode_num, self.n_agents, -1) q_target = q_target.view(episode_num, self.n_agents, -1) q_evals.append(q_eval) q_targets.append(q_target) # 得的q_eval和q_target是一个列表,列表里装着max_episode_len个数组,数组的的维度是(episode个数, n_agents,n_actions) # 把该列表转化成(episode个数, max_episode_len, n_agents,n_actions)的数组 q_evals = torch.stack(q_evals, dim=1) q_targets = torch.stack(q_targets, dim=1) return q_evals, q_targets def init_hidden(self, episode_num): # 为每个episode中的每个agent都初始化一个eval_hidden、target_hidden self.eval_hidden = torch.zeros( (episode_num, self.n_agents, self.args.rnn_hidden_dim)) self.target_hidden = torch.zeros( (episode_num, self.n_agents, self.args.rnn_hidden_dim)) def save_model(self, train_step): num = str(train_step // self.args.save_cycle) if not os.path.exists(self.model_dir): os.makedirs(self.model_dir) torch.save(self.eval_vdn_net.state_dict(), self.model_dir + '/' + num + '_vdn_net_params.pkl') torch.save(self.eval_rnn.state_dict(), self.model_dir + '/' + num + '_rnn_net_params.pkl')
class LIIR: def __init__(self, args): self.n_agents = args.n_agents self.n_actions = args.n_actions self.obs_shape = args.obs_shape actor_input_shape = self.obs_shape critic_input_shape = self._get_critic_input_shape() # Actor Network (RNN) if args.last_action: actor_input_shape += self.n_actions if args.reuse_networks: actor_input_shape += self.n_agents self.critic = LiirNetwork(critic_input_shape, args) self.target_critic = LiirNetwork(critic_input_shape, args) self.eval_rnn = RNN(actor_input_shape, args) self.target_rnn = RNN(actor_input_shape, args) print('Init Algo LIIR') if args.use_cuda and torch.cuda.is_available(): self.device = torch.device("cuda:0") self.eval_rnn = self.eval_rnn.to(self.device) self.target_rnn = self.target_rnn.to(self.device) self.critic = self.critic.to(self.device) self.target_critic = self.target_critic.to(self.device) else: self.device = torch.device("cpu") self.target_critic.load_state_dict(self.critic.state_dict()) self.target_rnn.load_state_dict(self.eval_rnn.state_dict()) self.agent_params = list(self.eval_rnn.parameters()) self.critic_params = list(self.critic.fc1.parameters()) + list(self.critic.fc2.parameters()) + \ list(self.critic.fc3_v_mix.parameters()) self.intrinsic_params = list(self.critic.fc3_r_in.parameters()) + list(self.critic.fc4.parameters()) self.agent_optimizer = Adam(params=self.agent_params, lr=args.actor_lr) self.critic_optimizer = Adam(params=self.critic_params, lr=args.critic_lr) self.intrinsic_optimizer = Adam(params=self.intrinsic_params, lr=args.critic_lr) # The timing of updating the target networks is controlled by the LIIR itself ('train_step' is unnecessary) self.last_target_update_step = 0 self.critic_training_steps = 0 self.args = args self.eval_hidden = None self.target_hidden = None def learn(self, batch, max_episode_len, train_step, epsilon): bs = batch['o'].shape[0] max_t = batch['o'].shape[1] for key in batch.keys(): if key == 'u': batch[key] = torch.tensor(batch[key], dtype=torch.long) else: batch[key] = torch.tensor(batch[key], dtype=torch.float32) rewards, actions, terminated = batch['r'][:, :-1], batch['u'][:, :], batch['terminated'][:, :-1] q_vals, target_mix, target_ex, v_ex, r_in = self._train_critic(batch, rewards, terminated, actions) actions = actions[:, :-1] # (bs, max_t, n_agents, shape) # ------------------ Calculate policy grad -------------------------- self.init_hidden(bs) mac_out = self._get_eval_action_prob(batch, max_t, epsilon)[:, :-1] # (bs, max_t - 1, n_agents, n_actions) mac_out = mac_out / mac_out.sum(dim=-1, keepdim=True) q_vals = q_vals.reshape(-1, 1) # (bs * max_t - 1 * n_agents, 1) pi = mac_out.view(-1, self.n_actions) pi_taken = torch.gather(pi, dim=1, index=actions.reshape(-1, 1)).squeeze(1) log_pi_taken = torch.log(pi_taken) # (bs * max_t - 1 * n_agents * 1) advantages = (target_mix.reshape(-1, 1) - q_vals).detach() advantages = (advantages - advantages.mean()) / (advantages.std() + 1e-8) agent_loss = -(advantages * log_pi_taken).mean() self.agent_optimizer.zero_grad() agent_loss.backward(retain_graph=True) self.agent_optimizer.step() # ---------------- Intrinsic loss Optimizer -------------------------- v_ex_loss = ((v_ex - target_ex.detach()) ** 2).view(-1, 1).mean() # ------- pg1 -------- self.init_hidden(bs) mac_out_old = self._get_target_action_prob(batch, max_t, epsilon)[:, :-1] mac_out_old = mac_out_old / mac_out_old.sum(dim=-1, keepdim=True) pi_old = mac_out_old.view(-1, self.n_actions) pi_old_taken = torch.gather(pi_old, dim=1, index=actions.reshape(-1, 1)).squeeze(1) 
log_pi_old_taken = torch.log(pi_old_taken) # (bs * max_t - 1 * n_agents * 1) log_pi_old_taken = log_pi_old_taken.reshape(-1, self.n_agents) # (bs * max_t - 1, n_agents * 1) # ------- pg2 -------- self.init_hidden(bs) mac_out_new = self._get_eval_action_prob(batch, max_t, epsilon)[:, :-1] mac_out_new = mac_out_new / mac_out_new.sum(dim=-1, keepdim=True) pi_new = mac_out_new.view(-1, self.n_actions) pi_new_taken = torch.gather(pi_new, dim=1, index=actions.reshape(-1, 1)).squeeze(1) log_pi_new_taken = torch.log(pi_new_taken) log_pi_new_taken = log_pi_new_taken.reshape(-1, self.n_agents) # (bs * max_t - 1, n_agents * 1) neglogpac_new = - log_pi_new_taken.sum(-1) # (bs * max_t - 1, 1) pi2 = log_pi_taken.reshape(-1, self.n_agents).sum(-1).clone() ratio_new = torch.exp(- pi2 - neglogpac_new) adv_ex = (target_ex - v_ex.detach()).detach() adv_ex = (adv_ex - adv_ex.mean()) / (adv_ex.std() + 1e-8) # _______ gradient for pg 1 and 2--- pg_loss1 = log_pi_old_taken.view(-1, 1).mean() pg_loss2 = (adv_ex.view(-1) * ratio_new).mean() self.target_rnn.zero_grad() pg_loss1_grad = torch.autograd.grad(pg_loss1, list(self.target_rnn.parameters())) self.eval_rnn.zero_grad() pg_loss2_grad = torch.autograd.grad(pg_loss2, list(self.eval_rnn.parameters())) total_grad = 0 for grad1, grad2 in zip(pg_loss1_grad, pg_loss2_grad): total_grad += (grad1 * grad2).sum() target_mix = target_mix.reshape(-1, max_t - 1, self.n_agents) pg_ex_loss = (total_grad.detach() * target_mix).mean() intrinsic_loss = pg_ex_loss + v_ex_loss self.intrinsic_optimizer.zero_grad() intrinsic_loss.backward() self.intrinsic_optimizer.step() self._update_policy_old() if (self.critic_training_steps - self.last_target_update_step) / self.args.target_update_cycle >= 1.0: self._update_target() self.last_target_update_step = self.critic_training_steps def _train_critic(self, batch, rewards, terminated, actions): bs = batch['o'].shape[0] max_t = batch['o'].shape[1] inputs = self._get_critic_inputs(batch) if self.args.use_cuda: inputs.to(self.device) _, target_vals, target_val_ex = self.target_critic(inputs) r_in, _, target_val_ex_opt = self.critic(inputs) r_in_taken = torch.gather(r_in, dim=3, index=actions) r_in = r_in_taken.squeeze(-1) # (bs, max_t, n_agents * 1) target_vals = target_vals.squeeze(-1) # (bs, max_t, n_agents * 1) # Here, <rewards> <terminated> --> (bs, max_t-1, 1) # And <target_vals> <target_val_ex> <r_in> --> (bs, max_t, n_agents, 1) targets_mix, targets_ex = build_td_lambda_targets(rewards, terminated, target_vals, r_in, target_val_ex, self.args.gamma, self.args.td_lambda) vals_mix = torch.zeros_like(target_vals)[:, :-1] vals_ex = target_val_ex_opt[:, :-1] for t in reversed(range(rewards.size(1))): inputs_t = self._get_critic_inputs(batch, transition_idx=t) # (bs, 1, n_agents, shape) if self.args.use_cuda: inputs_t.to(self.device) _, q_t, _ = self.critic(inputs_t) # (bs, 1, n_agents, 1) vals_mix[:, t] = q_t.view(bs, self.n_agents) # (bs, n_agents * 1) targets_t = targets_mix[:, t] td_error_loss = (q_t.view(bs, self.n_agents) - targets_t.detach()) loss = (td_error_loss ** 2).mean() self.critic_optimizer.zero_grad() loss.backward() self.critic_optimizer.step() self.critic_training_steps += 1 # Here, vals_mix, vals_ex --> (bs, max_t-1, n_agents * 1) # And, targets_mix, targets_ex --> (bs, max_t-1, n_agents * 1) return vals_mix, targets_mix, targets_ex, vals_ex, r_in def _get_critic_inputs(self, batch, transition_idx=None): """ :param transition_idx: if transition_idx is None, slice(None) makes the whole steps into the critic inputs if 
transition_idx is not None, ts represent the single time-step Because the LIIR Critic Network didn't choose the GRU Network, so the steps data could be used together """ bs = batch['o'].shape[0] max_t = self.args.episode_limit if transition_idx is None else 1 ts = slice(None) if transition_idx is None else slice(transition_idx, transition_idx + 1) obs, u_onehot = batch['o'][:, ts], batch['u_onehot'][:, ts] inputs = [] # States states = obs.view((bs, max_t, 1, -1)).repeat(1, 1, self.n_agents, 1) inputs.append(states) # Actions (joint actions masked out by each agent) actions = u_onehot.view((bs, max_t, 1, -1)).repeat(1, 1, self.n_agents, 1) agent_mask = (1 - torch.eye(self.n_agents)) agent_mask = agent_mask.view(-1, 1).repeat(1, self.n_actions).view(self.n_agents, -1) inputs.append(actions * agent_mask.unsqueeze(0).unsqueeze(0)) # Agent id inputs.append(torch.eye(self.n_agents).unsqueeze(0).unsqueeze(0).expand(bs, max_t, -1, -1)) inputs = torch.cat([x.reshape(bs, max_t, self.n_agents, -1) for x in inputs], dim=-1) return inputs def _get_actor_inputs(self, batch, transition_idx): # LIIR use the policy_new and policy_old to calculate the action probability respectively # So the policy doesn't need the obs_next obs, u_onehot = batch['o'][:, transition_idx], batch['u_onehot'][:] bs = obs.shape[0] # Observation inputs = [obs] if self.args.last_action: if transition_idx == 0: inputs.append(torch.zeros_like(u_onehot[:, transition_idx])) else: inputs.append(u_onehot[:, transition_idx - 1]) if self.args.reuse_networks: inputs.append(torch.eye(self.args.n_agents).unsqueeze(0).expand(bs, -1, -1)) # Since the using of GRU network, the inputs shape should be shaped as 2 dimensions inputs = torch.cat([x.reshape(bs * self.args.n_agents, -1) for x in inputs], dim=1) return inputs def _get_eval_action_prob(self, batch, max_episode_len, epsilon): bs = batch['o'].shape[0] # The available actions for each agent. In MPE, an agent could choose every action at any time-step. avail_actions = torch.ones_like(batch['u_onehot']) # (bs, episode_limit, n_agents, n_actions) --> all 1 action_prob = [] for transition_idx in range(max_episode_len): inputs = self._get_actor_inputs(batch, transition_idx) if self.args.use_cuda: inputs = inputs.to(self.device) self.eval_hidden = self.eval_hidden.to(self.device) outputs, self.eval_hidden = self.eval_rnn(inputs, self.eval_hidden) # (bs * n_agents, n_actions) outputs = outputs.view(bs, self.n_agents, -1) prob = torch.nn.functional.softmax(outputs, dim=-1) action_prob.append(prob) action_prob = torch.stack(action_prob, dim=1).cpu() # (bs, episode_limit, n_agents, n_actions) actions_num = avail_actions.sum(dim=-1, keepdim=True).float().repeat(1, 1, 1, avail_actions.shape[-1]) action_prob = (1 - epsilon) * action_prob + torch.ones_like(action_prob) * epsilon / actions_num action_prob = action_prob / action_prob.sum(dim=-1, keepdim=True) if self.args.use_cuda: action_prob = action_prob.to(self.device) return action_prob def _get_target_action_prob(self, batch, max_episode_len, epsilon): bs = batch['o'].shape[0] # The available actions for each agent. In MPE, an agent could choose every action at any time-step. 
avail_actions = torch.ones_like(batch['u_onehot']) # (bs, episode_limit, n_agents, n_actions) --> all 1 action_prob = [] for transition_idx in range(max_episode_len): inputs = self._get_actor_inputs(batch, transition_idx) if self.args.use_cuda: inputs = inputs.to(self.device) self.target_hidden = self.eval_hidden.to(self.device) outputs, self.target_hidden = self.target_rnn(inputs, self.eval_hidden) # (bs * n_agents, n_actions) outputs = outputs.view(bs, self.n_agents, -1) prob = torch.nn.functional.softmax(outputs, dim=-1) action_prob.append(prob) action_prob = torch.stack(action_prob, dim=1).cpu() # (bs, episode_limit, n_agents, n_actions) actions_num = avail_actions.sum(dim=-1, keepdim=True).float().repeat(1, 1, 1, avail_actions.shape[-1]) action_prob = (1 - epsilon) * action_prob + torch.ones_like(action_prob) * epsilon / actions_num action_prob = action_prob / action_prob.sum(dim=-1, keepdim=True) if self.args.use_cuda: action_prob = action_prob.to(self.device) return action_prob def _get_critic_input_shape(self): # State (concatenation of all agents' local observations) input_shape = self.obs_shape * self.n_agents # agent_id input_shape += self.n_agents # Critic Network needs current action and last action infos (default without last actions) # Joint actions (without last actions) input_shape += self.n_actions * self.n_agents return input_shape def init_hidden(self, batch_size): self.eval_hidden = torch.zeros((batch_size, self.n_agents, self.args.rnn_hidden_dim)) self.target_hidden = torch.zeros((batch_size, self.n_agents, self.args.rnn_hidden_dim)) def _update_policy_old(self): self.target_rnn.load_state_dict(self.eval_rnn.state_dict()) def _update_target(self): self.target_critic.load_state_dict(self.critic.state_dict()) def get_params(self): return {'policy_new': self.eval_rnn.state_dict(), 'critic': self.critic.state_dict()} def load_params(self, params_dict): self.eval_rnn.load_state_dict(params_dict['policy_new']) self.critic.load_state_dict(params_dict['critic']) self.target_rnn.load_state_dict(self.eval_rnn.state_dict()) self.target_critic.load_state_dict(self.critic.state_dict())
class MAAC: def __init__(self, args): self.n_agents = args.n_agents self.n_actions = args.n_actions self.obs_shape = args.obs_shape actor_input_shape = self.obs_shape # Actor Network (RNN) if args.last_action: actor_input_shape += self.n_actions if args.reuse_networks: actor_input_shape += self.n_agents self.eval_rnn = RNN(actor_input_shape, args) print('Init Algo MAAC') self.eval_critic = MaacCritic(args) self.target_critic = MaacCritic(args) if args.use_cuda and torch.cuda.is_available(): self.device = torch.device("cuda:0") self.eval_rnn.to(self.device) self.eval_critic.to(self.device) self.target_critic.to(self.device) else: self.device = torch.device("cpu") self.target_critic.load_state_dict(self.eval_critic.state_dict()) self.rnn_parameters = list(self.eval_rnn.parameters()) self.critic_parameters = list(self.eval_critic.parameters()) self.critic_optimizer = torch.optim.Adam(self.critic_parameters, lr=args.critic_lr) self.rnn_optimizer = torch.optim.Adam(self.rnn_parameters, lr=args.actor_lr) self.args = args self.loss_func = torch.nn.MSELoss() self.eval_hidden = None def learn(self, batch, max_episode_len, train_step, epsilon): bs = batch['o'].shape[0] self.init_hidden(bs) for key in batch.keys(): if key == 'u': batch[key] = torch.tensor(batch[key], dtype=torch.long) else: batch[key] = torch.tensor(batch[key], dtype=torch.float32) u = batch['u'] if self.args.use_cuda: u = u.to(self.device) critic_rets = self._train_critic(batch, train_step) q_taken, q_values = [], [] for a_i, (q_eval, q_all, regs) in zip(range(self.n_agents), critic_rets): q_taken.append(q_eval) q_values.append(q_all) q_taken = torch.stack(q_taken, dim=2).squeeze(3) q_values = torch.stack(q_values, dim=2) action_prob = self._get_action_prob(batch, max_episode_len, epsilon) pi_taken = torch.gather(action_prob, dim=3, index=u).squeeze(3) log_pi_taken = torch.log(pi_taken) # Advantage for actor(policy) optimization baseline = (q_values * action_prob).sum(dim=3, keepdim=True).squeeze(3).detach() advantage = (q_taken - baseline).detach() loss = - (advantage * log_pi_taken).mean() self.rnn_optimizer.zero_grad() disable_gradients(self.eval_critic) loss.backward() enable_gradients(self.eval_critic) torch.nn.utils.clip_grad_norm_(self.rnn_parameters, self.args.grad_norm_clip) self.rnn_optimizer.step() def _train_critic(self, batch, train_step): r, terminated = batch['r'], batch['terminated'] if self.args.use_cuda: r = r.to(self.device) terminated = terminated.to(self.device) critic_in, target_critic_in = self._get_critic_inputs(batch) q_targets = self.target_critic(target_critic_in) critic_rets = self.eval_critic(critic_in, return_all_q=True, regularize=True) q_loss = 0 for a_i, q_next, (q_eval, q_all, regs) in zip(range(self.n_agents), q_targets, critic_rets): target = r + self.args.gamma * q_next * (1 - terminated) q_loss += self.loss_func(target, q_eval) for reg in regs: q_loss += reg self.critic_optimizer.zero_grad() q_loss.backward() self.eval_critic.scale_shared_grads() torch.nn.utils.clip_grad_norm_(self.eval_critic.parameters(), self.args.grad_norm_clip * self.n_agents) self.critic_optimizer.step() if train_step > 0 and train_step % self.args.target_update_cycle == 0: self.target_critic.load_state_dict(self.eval_critic.state_dict()) return critic_rets def _get_action_prob(self, batch, max_episode_len, epsilon): bs = batch['o'].shape[0] # The available actions for each agent. In MPE, an agent could choose every action at any time-step. 
avail_actions = torch.ones_like(batch['u_onehot']) # (bs, episode_limit, n_agents, n_actions) --> all 1 action_prob = [] for transition_idx in range(max_episode_len): inputs = self._get_actor_inputs(batch, transition_idx) if self.args.use_cuda: inputs = inputs.to(self.device) self.eval_hidden = self.eval_hidden.to(self.device) outputs, self.eval_hidden = self.eval_rnn(inputs, self.eval_hidden) # outputs:(bs * n_agents, n_actions) outputs = outputs.view(bs, self.n_agents, -1) prob = torch.nn.functional.softmax(outputs, dim=-1) action_prob.append(prob) action_prob = torch.stack(action_prob, dim=1).cpu() # (bs, episode_limit, n_agents, n_actions) actions_num = avail_actions.sum(dim=-1, keepdim=True).float().repeat(1, 1, 1, avail_actions.shape[-1]) action_prob = (1 - epsilon) * action_prob + torch.ones_like(action_prob) * epsilon / actions_num action_prob = action_prob / action_prob.sum(dim=-1, keepdim=True) if self.args.use_cuda: action_prob = action_prob.to(self.device) return action_prob def _get_actor_inputs(self, batch, transition_idx): # Because the rnn agent actor network didn't initialize a target network, it requires none next infos obs, u_onehot = batch['o'][:, transition_idx], batch['u_onehot'][:] bs = obs.shape[0] # Observation inputs = [obs] if self.args.last_action: if transition_idx == 0: inputs.append(torch.zeros_like(u_onehot[:, transition_idx])) else: inputs.append(u_onehot[:, transition_idx - 1]) if self.args.reuse_networks: inputs.append(torch.eye(self.args.n_agents).unsqueeze(0).expand(bs, -1, -1)) # Since the using of GRU network, the inputs shape should be shaped as 2 dimensions inputs = torch.cat([x.reshape(bs * self.args.n_agents, -1) for x in inputs], dim=1) return inputs def _get_critic_inputs(self, batch): """ The MAAC algorithm handle the critic inputs with total steps (without transition_idx) """ obs, obs_next = batch['o'], batch['o_next'] # (bs, episode_limit, n_agents, obs_shape) u_onehot = batch['u_onehot'] # (bs, episode_limit, n_agents, n_actions) u_onehot_next = u_onehot[:, 1:] # (bs, episode_limit - 1, n_agents, n_actions) padded_next = torch.zeros(*u_onehot[:, -1].shape, dtype=torch.float32).unsqueeze(1) # Add a step with zeros u_onehot_next = torch.cat((u_onehot_next, padded_next), dim=1) if self.args.use_cuda: obs = obs.to(self.device) obs_next = obs_next.to(self.device) u_onehot = u_onehot.to(self.device) u_onehot_next = u_onehot_next.to(self.device) agents_obs, agents_obs_next = [], [] agents_u, agents_u_next = [], [] for a_i in range(self.n_agents): agent_obs, agent_obs_next = obs[:, :, a_i], obs_next[:, :, a_i] # (bs, episode_limit, obs_shape) agent_u, agent_u_next = u_onehot[:, :, a_i], u_onehot_next[:, :, a_i] # (bs, episode_limit, n_actions) agents_obs.append(agent_obs) agents_obs_next.append(agent_obs_next) agents_u.append(agent_u) agents_u_next.append(agent_u_next) target_critic_in = list(zip(agents_obs_next, agents_u_next)) critic_in = list(zip(agents_obs, agents_u)) return critic_in, target_critic_in def init_hidden(self, batch_size): self.eval_hidden = torch.zeros((batch_size, self.n_agents, self.args.rnn_hidden_dim)) def get_params(self): return {'eval_critic': self.eval_critic.state_dict(), 'eval_rnn': self.eval_rnn.state_dict()} def load_params(self, params_dict): # Get parameters from save_dict self.eval_rnn.load_state_dict(params_dict['eval_rnn']) self.eval_critic.load_state_dict(params_dict['eval_critic']) # Copy the eval networks to target networks self.target_critic.load_state_dict(self.target_critic.state_dict())
class QMIX: def __init__(self, args): self.n_agents = args.n_agents self.n_actions = args.n_actions self.obs_shape = args.obs_shape input_shape = self.obs_shape if args.last_action: input_shape += self.n_actions if args.reuse_networks: input_shape += self.n_agents self.eval_rnn = RNN(input_shape, args) self.target_rnn = RNN(input_shape, args) state_shape = self.obs_shape * self.n_agents self.eval_qmix_net = QMIXNet(state_shape, args) self.target_qmix_net = QMIXNet(state_shape, args) self.args = args if args.use_cuda and torch.cuda.is_available(): self.device = torch.device("cuda:0") self.eval_rnn.to(self.device) self.target_rnn.to(self.device) self.eval_qmix_net.to(self.device) self.target_qmix_net.to(self.device) else: self.device = torch.device("cpu") self.target_rnn.load_state_dict(self.eval_rnn.state_dict()) self.target_qmix_net.load_state_dict(self.eval_qmix_net.state_dict()) self.eval_parameters = list(self.eval_qmix_net.parameters()) + list( self.eval_rnn.parameters()) self.optimizer = torch.optim.Adam(self.eval_parameters, lr=args.lr) self.eval_hidden = None self.target_hidden = None print('Init algo QMIX') def learn(self, batch, max_episode_len, train_step, epsilon=None): bs = batch['o'].shape[0] self.init_hidden(bs) for key in batch.keys(): if key == 'u': batch[key] = torch.tensor(batch[key], dtype=torch.long) else: batch[key] = torch.tensor(batch[key], dtype=torch.float32) o, o_next, u, r, terminated = batch['o'], batch['o_next'], batch[ 'u'], batch['r'], batch['terminated'] if self.args.use_cuda and torch.cuda.is_available(): o = o.to(self.device) o_next = o_next.to(self.device) u = u.to(self.device) r = r.to(self.device) terminated = terminated.to(self.device) q_evals, q_targets = self._get_q_values(batch, max_episode_len) q_evals = torch.gather(q_evals, dim=3, index=u).squeeze(3) q_targets = q_targets.max(dim=3)[0] # qmix algorithm needs the total states infos to calculate the mixer network distribution # In the MPE, envs don't support the original states infos due to the Complete Observable (Not the POMDP) # So the state can be seen as the concatenation of all the agents' observations states = o.reshape((bs, self.args.episode_limit, -1)) states_next = o_next.reshape((bs, self.args.episode_limit, -1)) q_total_eval = self.eval_qmix_net(q_evals, states) q_total_target = self.target_qmix_net(q_targets, states_next) targets = r + self.args.gamma * q_total_target * (1 - terminated) td_error = targets.detach() - q_total_eval loss = td_error.pow(2).mean() self.optimizer.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm_(self.eval_parameters, self.args.grad_norm_clip) self.optimizer.step() if train_step > 0 and train_step % self.args.target_update_cycle == 0: self.target_rnn.load_state_dict(self.eval_rnn.state_dict()) self.target_qmix_net.load_state_dict( self.eval_qmix_net.state_dict()) def _get_q_values(self, batch, max_episode_len): bs = batch['o'].shape[0] q_evals, q_targets = [], [] for transition_idx in range(max_episode_len): inputs, inputs_next = self._get_inputs(batch, transition_idx) if self.args.use_cuda: inputs = inputs.to(self.device) inputs_next = inputs_next.to(self.device) self.eval_hidden = self.eval_hidden.to(self.device) self.target_hidden = self.target_hidden.to(self.device) q_eval, self.eval_hidden = self.eval_rnn(inputs, self.eval_hidden) q_target, self.target_hidden = self.target_rnn( inputs_next, self.target_hidden) q_eval = q_eval.view(bs, self.n_agents, -1) q_target = q_target.view(bs, self.n_agents, -1) q_evals.append(q_eval) 
q_targets.append(q_target) q_evals = torch.stack(q_evals, dim=1) q_targets = torch.stack(q_targets, dim=1) return q_evals, q_targets def _get_inputs(self, batch, transition_idx): obs, obs_next, u_onehot = batch['o'][:, transition_idx], \ batch['o_next'][:, transition_idx], batch['u_onehot'][:] bs = obs.shape[0] inputs, inputs_next = [], [] inputs.append(obs) inputs_next.append(obs_next) if self.args.last_action: if transition_idx == 0: inputs.append(torch.zeros_like(u_onehot[:, transition_idx])) else: inputs.append(u_onehot[:, transition_idx - 1]) inputs_next.append(u_onehot[:, transition_idx]) if self.args.reuse_networks: inputs.append( torch.eye(self.n_agents).unsqueeze(0).expand(bs, -1, -1)) inputs_next.append( torch.eye(self.n_agents).unsqueeze(0).expand(bs, -1, -1)) inputs = torch.cat([x.reshape(bs * self.n_agents, -1) for x in inputs], dim=1) inputs_next = torch.cat( [x.reshape(bs * self.n_agents, -1) for x in inputs_next], dim=1) return inputs, inputs_next def init_hidden(self, batch_size): self.eval_hidden = torch.zeros( (batch_size, self.n_agents, self.args.rnn_hidden_dim)) self.target_hidden = torch.zeros( (batch_size, self.n_agents, self.args.rnn_hidden_dim)) def get_params(self): return { 'eval_rnn': self.eval_rnn.state_dict(), 'eval_qmix_net': self.eval_qmix_net.state_dict() } def load_params(self, params_dict): # Get parameters from save_dict self.eval_rnn.load_state_dict(params_dict['eval_rnn']) self.eval_qmix_net.load_state_dict(params_dict['eval_qmix_net']) # Copy the eval networks to target networks self.target_rnn.load_state_dict(self.eval_rnn.state_dict()) self.target_qmix_net.load_state_dict(self.eval_qmix_net.state_dict())
class QTran: def __init__(self, args, itr): self.n_actions = args.n_actions self.n_agents = args.n_agents self.state_shape = args.state_shape self.obs_shape = args.obs_shape self.args = args rnn_input_shape = self.obs_shape # 根据参数决定RNN的输入维度 if args.last_action: rnn_input_shape += self.n_actions # 当前agent的上一个动作的one_hot向量 if args.reuse_network: rnn_input_shape += self.n_agents # 神经网络 self.eval_rnn = RNN(rnn_input_shape, args) # 每个agent选动作的网络 self.target_rnn = RNN(rnn_input_shape, args) # self.eval_joint_q = QtranQBase(args) # Joint action-value network # self.target_joint_q = QtranQBase(args) # 默认值分解算法使用 qtran_base if self.args.alg == 'qtran_base': self.eval_joint_q = QtranQBase(args) self.target_joint_q = QtranQBase(args) elif self.args.alg == 'qtran_alt': # self.mixer = QTranAlt(args) # self.target_mixer = QTranAlt(args) raise Exception("Not supported yet.") else: raise Exception('QTRAN只有qtran_base!') self.v = QtranV(args) if self.args.cuda: self.eval_rnn.cuda() self.target_rnn.cuda() self.eval_joint_q.cuda() self.target_joint_q.cuda() self.v.cuda() # self.model_dir = args.model_dir + '/' + args.alg + '/' + args.map + '/' + str(itr) # self.model_dir = args.model_dir + '/' + args.map + '/' + args.alg + '_' + str(self.args.epsilon_anneal_steps // 10000) + 'w' + '/' + str(itr) # 如果存在模型则加载模型 if self.args.load_model: if os.path.exists(self.args.model_dir + '/rnn_net_params.pkl'): path_rnn = self.args.model_dir + '/rnn_net_params.pkl' path_joint_q = self.args.model_dir + '/joint_q_params.pkl' path_v = self.args.model_dir + '/v_params.pkl' map_location = 'cuda:0' if self.args.cuda else 'cpu' self.eval_rnn.load_state_dict(torch.load(path_rnn, map_location=map_location)) self.eval_joint_q.load_state_dict(torch.load(path_joint_q, map_location=map_location)) self.v.load_state_dict(torch.load(path_v, map_location=map_location)) print('Successfully load the model: {}, {} and {}'.format(path_rnn, path_joint_q, path_v)) else: raise Exception("No model!") # 让target_net和eval_net的网络参数相同 self.target_rnn.load_state_dict(self.eval_rnn.state_dict()) self.target_joint_q.load_state_dict(self.eval_joint_q.state_dict()) self.eval_parameters = list(self.eval_joint_q.parameters()) + \ list(self.v.parameters()) + \ list(self.eval_rnn.parameters()) if args.optim == "RMS": self.optimizer = torch.optim.RMSprop(self.eval_parameters, lr=args.lr) # 执行过程中,要为每个agent都维护一个eval_hidden # 学习过程中,要为每个episode的每个agent都维护一个eval_hidden、target_hidden self.eval_hidden = None self.target_hidden = None print('Init alg QTRAN-base') def learn(self, batch, max_episode_len, train_step, epsilon=None): # train_step表示是第几次学习,用来控制更新target_net网络的参数 ''' 在learn的时候,抽取到的数据是四维的,四个维度分别为 1——第几个episode 2——episode中第几个transition 3——第几个agent的数据 4——具体obs维度。因为在选动作时不仅需要输入当前的inputs,还要给神经网络输入hidden_state, hidden_state和之前的经验相关,因此就不能随机抽取经验进行学习。所以这里一次抽取多个episode,然后一次给神经网络 传入每个episode的同一个位置的transition ''' episode_num = batch['o'].shape[0] self.init_hidden(episode_num) # for key in batch.keys(): # 把batch里的数据转化成tensor # if key == 'u': # batch[key] = torch.tensor(batch[key], dtype=torch.long) # else: # batch[key] = torch.tensor(batch[key], dtype=torch.float32) # 数据转为tensor for key in batch.keys(): if key == 'a': batch[key] = torch.LongTensor(batch[key]) else: batch[key] = torch.Tensor(batch[key]) u, r, avail_u, avail_u_next, terminated = batch['a'], batch['r'], batch['avail_a'], \ batch['next_avail_a'], batch['done'] mask = (1 - batch["padded"].float()).squeeze(-1) # 用来把那些填充的经验的TD-error置0,从而不让它们影响到学习 if self.args.cuda: u = u.cuda() r = r.cuda() avail_u = avail_u.cuda() 
avail_u_next = avail_u_next.cuda() terminated = terminated.cuda() mask = mask.cuda() # 得到每个agent对应的Q和hidden_states,维度为(episode个数, max_episode_len, n_agents, n_actions/hidden_dim) individual_q_evals, individual_q_targets, hidden_evals, hidden_targets = self._get_individual_q(batch, max_episode_len) # 得到当前时刻和下一时刻每个agent的局部最优动作及其one_hot表示 individual_q_clone = individual_q_evals.clone() individual_q_clone[avail_u == 0.0] = - 999999 individual_q_targets[avail_u_next == 0.0] = - 999999 opt_onehot_eval = torch.zeros(*individual_q_clone.shape) opt_action_eval = individual_q_clone.argmax(dim=3, keepdim=True) opt_onehot_eval = opt_onehot_eval.scatter(-1, opt_action_eval[:, :].cpu(), 1) opt_onehot_target = torch.zeros(*individual_q_targets.shape) opt_action_target = individual_q_targets.argmax(dim=3, keepdim=True) opt_onehot_target = opt_onehot_target.scatter(-1, opt_action_target[:, :].cpu(), 1) # ---------------------------------------------L_td------------------------------------------------------------- # 计算joint_q和v # joint_q、v的维度为(episode个数, max_episode_len, 1), 而且joint_q在后面的l_nopt还要用到 joint_q_evals, joint_q_targets, v = self.get_qtran(batch, hidden_evals, hidden_targets, opt_onehot_target) # loss y_dqn = r.squeeze(-1) + self.args.gamma * joint_q_targets * (1 - terminated.squeeze(-1)) td_error = joint_q_evals - y_dqn.detach() l_td = ((td_error * mask) ** 2).sum() / mask.sum() # ---------------------------------------------L_td------------------------------------------------------------- # ---------------------------------------------L_opt------------------------------------------------------------ # 将局部最优动作的Q值相加 # 这里要使用individual_q_clone,它把不能执行的动作Q值改变了,使用individual_q_evals可能会使用不能执行的动作的Q值 q_sum_opt = individual_q_clone.max(dim=-1)[0].sum(dim=-1) # (episode个数, max_episode_len) # 重新得到joint_q_hat_opt,它和joint_q_evals的区别是前者输入的动作是局部最优动作,后者输入的动作是执行的动作 # (episode个数, max_episode_len) joint_q_hat_opt, _, _ = self.get_qtran(batch, hidden_evals, hidden_targets, opt_onehot_eval, hat=True) opt_error = q_sum_opt - joint_q_hat_opt.detach() + v # 计算l_opt时需要将joint_q_hat_opt固定 l_opt = ((opt_error * mask) ** 2).sum() / mask.sum() # ---------------------------------------------L_opt------------------------------------------------------------ # ---------------------------------------------L_nopt----------------------------------------------------------- # 每个agent的执行动作的Q值,(episode个数, max_episode_len, n_agents, 1) q_individual = torch.gather(individual_q_evals, dim=-1, index=u).squeeze(-1) q_sum_nopt = q_individual.sum(dim=-1) # (episode个数, max_episode_len) nopt_error = q_sum_nopt - joint_q_evals.detach() + v # 计算l_nopt时需要将joint_q_evals固定 nopt_error = nopt_error.clamp(max=0) l_nopt = ((nopt_error * mask) ** 2).sum() / mask.sum() # ---------------------------------------------L_nopt----------------------------------------------------------- # print('l_td is {}, l_opt is {}, l_nopt is {}'.format(l_td, l_opt, l_nopt)) loss = l_td + self.args.lambda_opt * l_opt + self.args.lambda_nopt * l_nopt # loss = l_td + self.args.lambda_opt * l_opt self.optimizer.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm_(self.eval_parameters, self.args.clip_norm) self.optimizer.step() if train_step > 0 and train_step % self.args.target_update_period == 0: self.target_rnn.load_state_dict(self.eval_rnn.state_dict()) self.target_joint_q.load_state_dict(self.eval_joint_q.state_dict()) def _get_individual_q(self, batch, max_episode_len): episode_num = batch['o'].shape[0] q_evals, q_targets, hidden_evals, hidden_targets = [], [], [], [] 
for transition_idx in range(max_episode_len): inputs, inputs_next = self._get_individual_inputs(batch, transition_idx) # 给obs加last_action、agent_id if self.args.cuda: inputs = inputs.cuda() inputs_next = inputs_next.cuda() self.eval_hidden = self.eval_hidden.cuda() self.target_hidden = self.target_hidden.cuda() # 要用第一条经验把target网络的hidden_state初始化好,直接用第二条经验传入target网络不对 if transition_idx == 0: _, self.target_hidden = self.target_rnn(inputs, self.eval_hidden) q_eval, self.eval_hidden = self.eval_rnn(inputs, self.eval_hidden) # inputs维度为(40,96),得到的q_eval维度为(40,n_actions) q_target, self.target_hidden = self.target_rnn(inputs_next, self.target_hidden) hidden_eval, hidden_target = self.eval_hidden.clone(), self.target_hidden.clone() # 把q_eval维度重新变回(8, 5,n_actions) q_eval = q_eval.view(episode_num, self.n_agents, -1) q_target = q_target.view(episode_num, self.n_agents, -1) hidden_eval = hidden_eval.view(episode_num, self.n_agents, -1) hidden_target = hidden_target.view(episode_num, self.n_agents, -1) q_evals.append(q_eval) q_targets.append(q_target) hidden_evals.append(hidden_eval) hidden_targets.append(hidden_target) # 得的q_eval和q_target是一个列表,列表里装着max_episode_len个数组,数组的的维度是(episode个数, n_agents,n_actions) # 把该列表转化成(episode个数, max_episode_len, n_agents,n_actions)的数组 q_evals = torch.stack(q_evals, dim=1) q_targets = torch.stack(q_targets, dim=1) hidden_evals = torch.stack(hidden_evals, dim=1) hidden_targets = torch.stack(hidden_targets, dim=1) return q_evals, q_targets, hidden_evals, hidden_targets def _get_individual_inputs(self, batch, transition_idx): # 取出所有episode上该transition_idx的经验,u_onehot要取出所有,因为要用到上一条 obs, obs_next, u_onehot = batch['o'][:, transition_idx], \ batch['next_o'][:, transition_idx], batch['onehot_a'][:] episode_num = obs.shape[0] inputs, inputs_next = [], [] inputs.append(obs) inputs_next.append(obs_next) # 给obs添加上一个动作、agent编号 if self.args.last_action: if transition_idx == 0: # 如果是第一条经验,就让前一个动作为0向量 inputs.append(torch.zeros_like(u_onehot[:, transition_idx])) else: inputs.append(u_onehot[:, transition_idx - 1]) inputs_next.append(u_onehot[:, transition_idx]) if self.args.reuse_network: # 因为当前的obs三维的数据,每一维分别代表(episode编号,agent编号,obs维度),直接在dim_1上添加对应的向量 # 即可,比如给agent_0后面加(1, 0, 0, 0, 0),表示5个agent中的0号。而agent_0的数据正好在第0行,那么需要加的 # agent编号恰好就是一个单位矩阵,即对角线为1,其余为0 inputs.append(torch.eye(self.args.n_agents).unsqueeze(0).expand(episode_num, -1, -1)) inputs_next.append(torch.eye(self.args.n_agents).unsqueeze(0).expand(episode_num, -1, -1)) # 要把obs中的三个拼起来,并且要把episode_num个episode、self.args.n_agents个agent的数据拼成40条(40,96)的数据, # 因为这里所有agent共享一个神经网络,每条数据中带上了自己的编号,所以还是自己的数据 inputs = torch.cat([x.reshape(episode_num * self.args.n_agents, -1) for x in inputs], dim=1) inputs_next = torch.cat([x.reshape(episode_num * self.args.n_agents, -1) for x in inputs_next], dim=1) return inputs, inputs_next def get_qtran(self, batch, hidden_evals, hidden_targets, local_opt_actions, hat=False): episode_num, max_episode_len, _, _ = hidden_targets.shape states = batch['s'][:, :max_episode_len] states_next = batch['next_s'][:, :max_episode_len] u_onehot = batch['onehot_a'][:, :max_episode_len] if self.args.cuda: states = states.cuda() states_next = states_next.cuda() u_onehot = u_onehot.cuda() hidden_evals = hidden_evals.cuda() hidden_targets = hidden_targets.cuda() local_opt_actions = local_opt_actions.cuda() if hat: # 神经网络输出的q_eval、q_target、v的维度为(episode_num * max_episode_len, 1) q_evals = self.eval_joint_q(states, hidden_evals, local_opt_actions) q_targets = None v = None # 把q_eval维度变回(episode_num, max_episode_len) 
q_evals = q_evals.view(episode_num, -1, 1).squeeze(-1) else: q_evals = self.eval_joint_q(states, hidden_evals, u_onehot) q_targets = self.target_joint_q(states_next, hidden_targets, local_opt_actions) v = self.v(states, hidden_evals) # 把q_eval、q_target、v维度变回(episode_num, max_episode_len) q_evals = q_evals.view(episode_num, -1, 1).squeeze(-1) q_targets = q_targets.view(episode_num, -1, 1).squeeze(-1) v = v.view(episode_num, -1, 1).squeeze(-1) return q_evals, q_targets, v def init_hidden(self, episode_num): # 为每个episode中的每个agent都初始化一个eval_hidden、target_hidden self.eval_hidden = torch.zeros((episode_num, self.n_agents, self.args.rnn_hidden_dim)) self.target_hidden = torch.zeros((episode_num, self.n_agents, self.args.rnn_hidden_dim)) def save_model(self, train_step): # num = str(train_step // self.args.save_cycle) # if not os.path.exists(self.model_dir): # os.makedirs(self.model_dir) if not os.path.exists(self.args.model_dir): os.makedirs(self.args.model_dir) if type(train_step) == str: num = train_step else: num = str(train_step // self.args.save_model_period) torch.save(self.eval_rnn.state_dict(), self.args.model_dir + '/' + num + '_rnn_net_params.pkl') torch.save(self.eval_joint_q.state_dict(), self.args.model_dir + '/' + num + '_joint_q_params.pkl') torch.save(self.v.state_dict(), self.args.model_dir + '/' + num + '_v_params.pkl')
class QMIX_PG(): def __init__(self, agent, args): self.args = args self.log_alpha = torch.zeros( 1, dtype=torch.float32) # , requires_grad=True) if args.cuda: self.log_alpha = self.log_alpha.cuda() self.log_alpha.requires_grad = True self.alpha = self.args.alpha self.alpha_optimizer = torch.optim.Adam([self.log_alpha], lr=3e-4) self.agent = agent # self.target_policy = copy.deepcopy(self.agent.policy) # TODO self.policy_params = list(agent.policy.parameters()) self.policy_optimiser = torch.optim.RMSprop( params=self.policy_params, lr=args.actor_lr) # , alpha=args.optim_alpha, eps=args.optim_eps) self.n_actions = args.n_actions self.n_agents = args.n_agents self.state_shape = args.state_shape self.obs_shape = args.obs_shape input_shape = self.obs_shape # 根据参数决定RNN的输入维度 if args.last_action: input_shape += self.n_actions if args.reuse_network: input_shape += self.n_agents # 神经网络 self.eval_rnn = RNN(input_shape, args) # 每个agent选动作的网络 self.target_rnn = RNN(input_shape, args) self.eval_rnn_2 = RNN(input_shape, args) # 每个agent选动作的网络 self.target_rnn_2 = RNN(input_shape, args) self.eval_qmix_net = QMixNet(args) # 把agentsQ值加起来的网络 self.target_qmix_net = QMixNet(args) self.eval_qmix_net_2 = QMixNet(args) # 把agentsQ值加起来的网络 self.target_qmix_net_2 = QMixNet(args) self.args = args if self.args.cuda: self.eval_rnn.cuda() self.target_rnn.cuda() self.eval_rnn_2.cuda() self.target_rnn_2.cuda() self.eval_qmix_net.cuda() self.target_qmix_net.cuda() self.eval_qmix_net_2.cuda() self.target_qmix_net_2.cuda() # self.model_dir = args.model_dir + '/' + args.alg + '/' + args.map tmp = f'clamp2-5_rewardscale10_' + f'{args.buffer_size}_{args.actor_buffer_size}_{args.critic_buffer_size}_{args.actor_train_steps}_{args.critic_train_steps}_' \ f'{args.actor_update_delay}_{args.critic_lr}' self.model_dir = args.model_dir + '/linear_mix/' + 'qmix_sac_cf' + '/' + tmp + '/' + args.map # _gradclip0.5 # 如果存在模型则加载模型 if self.args.load_model: if os.path.exists(self.model_dir + '/rnn_net_params.pkl'): path_rnn = self.model_dir + '/rnn_net_params.pkl' path_qmix = self.model_dir + '/qmix_net_params.pkl' path_policy = self.model_dir + '/policy_net_params.pkl' map_location = 'cuda:0' if self.args.cuda else 'cpu' self.eval_rnn.load_state_dict( torch.load(path_rnn, map_location=map_location)) self.eval_rnn_2.load_state_dict( torch.load(path_rnn, map_location=map_location)) self.eval_qmix_net.load_state_dict( torch.load(path_qmix, map_location=map_location)) self.eval_qmix_net_2.load_state_dict( torch.load(path_qmix, map_location=map_location)) # self.target_policy.load_state_dict(torch.load(path_policy, map_location=map_location)) # TODO print('Successfully load the model: {} and {}'.format( path_rnn, path_qmix)) else: raise Exception("No model!") # 让target_net和eval_net的网络参数相同 self.target_rnn.load_state_dict(self.eval_rnn.state_dict()) self.target_rnn_2.load_state_dict(self.eval_rnn.state_dict()) self.target_qmix_net.load_state_dict(self.eval_qmix_net.state_dict()) self.target_qmix_net_2.load_state_dict( self.eval_qmix_net_2.state_dict()) # self.target_policy.load_state_dict(self.agent.policy.state_dict()) # TODO self.eval_parameters = list(self.eval_qmix_net.parameters()) + list( self.eval_rnn.parameters()) if args.optimizer == "RMS": self.optimizer = torch.optim.RMSprop(self.eval_parameters, lr=args.critic_lr) self.eval_parameters_2 = list( self.eval_qmix_net_2.parameters()) + list( self.eval_rnn_2.parameters()) if args.optimizer == "RMS": self.optimizer_2 = torch.optim.RMSprop(self.eval_parameters_2, lr=args.critic_lr) # 
执行过程中,要为每个agent都维护一个eval_hidden # 学习过程中,要为每个episode的每个agent都维护一个eval_hidden、target_hidden self.eval_hidden = None self.target_hidden = None print('Init alg QMIX') def train_actor(self, batch, max_episode_len, actor_sample_times=1): # EpisodeBatch episode_num = batch['o'].shape[0] self.init_hidden(episode_num) for key in batch.keys(): # 把batch里的数据转化成tensor if key == 'u': batch[key] = torch.tensor(batch[key], dtype=torch.long) else: batch[key] = torch.tensor(batch[key], dtype=torch.float32) # s, s_next, u, r, avail_u, avail_u_next, terminated = batch['s'], batch['s_next'], batch['u'], \ # batch['r'], batch['avail_u'], batch['avail_u_next'], \ # batch['terminated'] s = batch['s'] terminated = batch['terminated'] mask = 1 - batch["padded"].float() # 用来把那些填充的经验的TD-error置0,从而不让它们影响到学习 mask[:, 1:] = mask[:, 1:] * (1 - terminated[:, :-1]) mask = mask.repeat(1, 1, self.n_agents).view(-1) # mask = mask.repeat(1, 1, self.n_agents) # actions = batch["u"][:, :-1] actions = batch["u"] avail_u = batch["avail_u"] # terminated = batch["terminated"][:, :-1].float() # avail_actions = batch["avail_u"][:, :-1] if self.args.cuda: s = s.cuda() # u = u.cuda() # r = r.cuda() # s_next = s_next.cuda() # terminated = terminated.cuda() actions = actions.cuda() avail_u = avail_u.cuda() mask = mask.cuda() # # build q # inputs = self.critic._build_inputs(batch, bs, max_t) # q_vals = self.critic.forward(inputs).detach()[:, :-1] episode_num = batch['o'].shape[0] q_evals, q_targets = [], [] actions_prob = [] actions_logprobs = [] actions_probs_nozero = [] # self.agent.init_hidden(batch.batch_size) # self.policy_hidden= self.policy.init_hidden().unsqueeze(0).expand(batch_size, self.n_agents, -1) # bav for transition_idx in range(max_episode_len): inputs, inputs_next = self._get_inputs( batch, transition_idx) # 给obs加last_action、agent_id if self.args.cuda: inputs = inputs.cuda() inputs_next = inputs_next.cuda() self.eval_hidden = self.eval_hidden.cuda() q_eval, self.eval_hidden = self.eval_rnn( inputs, self.eval_hidden ) # inputs维度为(40,96),得到的q_eval维度为(40,n_actions) Q_i # 把q_eval维度重新变回(8, 5,n_actions) q_eval = q_eval.view(episode_num, self.n_agents, -1) q_evals.append(q_eval) agent_outs, self.policy_hidden = self.agent.policy( inputs, self.policy_hidden) avail_actions_ = avail_u[:, transition_idx] reshaped_avail_actions = avail_actions_.reshape( episode_num * self.n_agents, -1) agent_outs[reshaped_avail_actions == 0] = -1e11 # agent_outs = torch.nn.functional.softmax(agent_outs, dim=-1) # agent_outs = gumbel_softmax(agent_outs, hard=False) agent_outs = F.softmax(agent_outs / 1, dim=1) # 概率分布 # If hard=True, then the returned sample will be one-hot, otherwise it will # be a probabilitiy distribution that sums to 1 across classes agent_outs = agent_outs.view(episode_num, self.n_agents, -1) actions_prob.append(agent_outs) # 每个动作的概率 # actions_prob_nozero = agent_outs.clone() # actions_prob_nozero[reshaped_avail_actions.view(episode_num, self.n_agents, -1) == 0] = 1e-11 # # TODO 概率没有0 # actions_probs_nozero.append(actions_prob_nozero) # Have to deal with situation of 0.0 probabilities because we can't do log 0 z = agent_outs == 0.0 z = z.float() * 1e-8 actions_logprobs.append(torch.log(agent_outs + z)) # 得的q_eval和q_target是一个列表,列表里装着max_episode_len个数组,数组的的维度是(episode个数, n_agents,n_actions) # 把该列表转化成(episode个数, max_episode_len, n_agents,n_actions)的数组 q_vals = torch.stack(q_evals, dim=1) actions_prob = torch.stack(actions_prob, dim=1) # Concat over time log_prob_pi = torch.stack(actions_logprobs, dim=1) # cur_pol_sample_actions = 
np.random.choice(np.arange(self.n_actions), 1, # p=actions_prob.detach().cpu().numpy()) # action是一个整数 按概率分布采样 # actor_sample_times=1 # for i in range(actor_sample_times): # todo actor_sample_times = 1 samples = torch.multinomial( actions_prob.view(-1, self.n_actions), actor_sample_times, replacement=True) # a.shape = (batch_size, num_a) cur_pol_sample_actions = samples.view( episode_num, -1, self.n_agents, actor_sample_times) # max_episode_len #### actor_sample_times=1 q_curpi_sample_actions = torch.gather( q_vals, dim=3, index=cur_pol_sample_actions).squeeze(3) # (1,60,3) #### actor_sample_times # q_curpi_sample_actions = torch.gather(q_vals, dim=3,index=cur_pol_sample_actions) # (1,60,3) todo actor_sample_times # q_curpi_sample_actions = q_curpi_sample_actions.permute(3, 0, 1, 2).reshape(-1, max_episode_len, self.n_agents) # todo actor_sample_times # s = s.repeat(repeats=(actor_sample_times, 1, 1)) # (1,60,3) todo actor_sample_times # mask = mask.repeat(repeats=(actor_sample_times, 1, 1)).view(-1) # (1,60,3) todo actor_sample_times q_curpi_sample = self.eval_qmix_net(q_curpi_sample_actions, s).detach() # TODO # (1,60,1) Q_curpi_sample = q_curpi_sample.repeat( repeats=(1, 1, self.n_agents)) # (1,60,3) # q_targets_mean = torch.sum(actions_prob * q_vals, dim=-1).view(-1).detach() # (180) # q_i_mean_negi_mean = torch.sum(actions_prob * (-self.alpha * log_prob_pi + q_vals), dim=-1) # (1,60,3) # TODO q_i_mean_negi_mean = torch.sum(actions_prob * q_vals, dim=-1) # (1,60,3) # TODO # q_i_mean_negi_mean = q_i_mean_negi_mean.repeat(repeats=(actor_sample_times, 1, 1)) # todo actor_sample_times q_i_mean_negi_sample = [] # torch.stack( [torch.cat((q_targets_mean[:, :, i].unsqueeze(2), torch.cat((q_taken[:, :, :i], q_taken[:, :, i + 1:]), dim=2)), # dim=2) for i in range(3)],dim=1) # 顺序不对 q_curpi_sample_actions_detached = q_curpi_sample_actions.detach() for i in range(self.n_agents): q_temp = copy.deepcopy(q_curpi_sample_actions_detached) q_temp[:, :, i] = q_i_mean_negi_mean[:, :, i] # q_i(mean_action) q_-i(tacken_action), q_i_mean_negi_sample.append(q_temp) q_i_mean_negi_sample = torch.stack(q_i_mean_negi_sample, dim=2) # [1, 60, 3, 3] q_i_mean_negi_sample = q_i_mean_negi_sample.view( episode_num, -1, self.n_agents) # [1, 60*3, 3] # s_repeat = s.repeat(repeats=(1, self.n_agents, 1)) # (1,60,48)-> # (1,60*3,48) #TODO 顺序错误 s_repeat = s.repeat( repeats=(1, 1, self.n_agents)) # (1,60,48)-> # (1,60,48*3) # s_repeat = s_repeat.view(episode_num, self.n_agents, -1) # (1,60,48*3)-> # (1,3,48*60)#TODO 一样的 s_repeat = s_repeat.view( episode_num, self.n_agents * max_episode_len, self.args.state_shape) # (1,60,48*3)-> # (1,60*3,48)# Q_i_mean_negi_sample = self.eval_qmix_net( q_i_mean_negi_sample, s_repeat).detach() # TODO # (1,60*3,1) # q_total_target = q_total_target.repeat(repeats=(1, 1, self.n_agents)) # (1,60,3) Q_i_mean_negi_sample = Q_i_mean_negi_sample.view( episode_num, -1, self.n_agents) # (1,60*3,1)->(1,60,3) ###方法1 # pi = actions_prob.view(-1, self.n_actions) # # Calculate policy grad with mask # pi_sample = torch.gather(pi, dim=1, index=cur_pol_sample_actions.reshape(-1, 1)).squeeze(1) # pi_sample[mask == 0] = 1.0 # log_pi_sample = torch.log(pi_sample) ###方法2 log_pi_sample = torch.gather( log_prob_pi, dim=3, index=cur_pol_sample_actions).squeeze(-1).view(-1) # advantages = (-self.alpha * log_pi_sample + (Q_curpi_sample.view(-1) - Q_i_mean_negi_sample.view(-1)).detach()) # TODO Bug? 
advantages = (-self.alpha * log_pi_sample + Q_curpi_sample.view(-1) - Q_i_mean_negi_sample.view(-1)).detach() # TODO # if self.args.advantage_norm: # TODO # EPS = 1e-10 # advantages = (advantages - advantages.mean()) / (advantages.std() + EPS) policy_loss = -((advantages * log_pi_sample) * mask).sum() / mask.sum() # Optimise agents self.policy_optimiser.zero_grad() # don't want critic to accumulate gradients from policy loss disable_gradients(self.eval_rnn) disable_gradients(self.eval_qmix_net) policy_loss.backward() # policy gradient enable_gradients(self.eval_rnn) enable_gradients(self.eval_qmix_net) grad_norm = torch.nn.utils.clip_grad_norm_( self.policy_params, self.args.grad_norm_clip) # 0.5 todo self.policy_optimiser.step() # compute parameters sum for debugging p_sum = 0. for p in self.policy_params: p_sum += p.data.abs().sum().item() / 100.0 self.soft_update() def train_critic(self, batch, max_episode_len, train_step, epsilon=None): # train_step表示是第几次学习,用来控制更新target_net网络的参数 ''' 在learn的时候,抽取到的数据是四维的,四个维度分别为 1——第几个episode 2——episode中第几个transition 3——第几个agent的数据 4——具体obs维度。因为在选动作时不仅需要输入当前的inputs,还要给神经网络输入hidden_state, hidden_state和之前的经验相关,因此就不能随机抽取经验进行学习。所以这里一次抽取多个episode,然后一次给神经网络 传入每个episode的同一个位置的transition ''' episode_num = batch['o'].shape[0] self.init_hidden(episode_num) for key in batch.keys(): # 把batch里的数据转化成tensor if key == 'u': batch[key] = torch.tensor(batch[key], dtype=torch.long) else: batch[key] = torch.tensor(batch[key], dtype=torch.float32) s, s_next, u, r, avail_u, avail_u_next, terminated = batch['s'], batch['s_next'], batch['u'], \ batch['r'], batch['avail_u'], batch['avail_u_next'], \ batch['terminated'] mask = 1 - batch["padded"].float() # 用来把那些填充的经验的TD-error置0,从而不让它们影响到学习 mask[:, 1:] = mask[:, 1:] * (1 - terminated[:, :-1]) mask = mask.repeat(1, 1, self.n_agents) # r = 10. 
* (r - r.mean()) / (r.std() + 1e-6) # normalize with batch mean and std; plus a small number to prevent numerical problem r = (r - r.mean()) / (r.std() + 1e-6) # todo # 得到每个agent对应的Q值,维度为(episode个数, max_episode_len, n_agents, n_actions) q_evals, q_targets = self.get_q_values(batch, max_episode_len) q_evals_2, q_targets_2 = self.get_q_values_2(batch, max_episode_len) if self.args.cuda: s = s.cuda() u = u.cuda() r = r.cuda() s_next = s_next.cuda() terminated = terminated.cuda() mask = mask.cuda() # 取每个agent动作对应的Q值,并且把最后不需要的一维去掉,因为最后一维只有一个值了 q_evals = torch.gather(q_evals, dim=3, index=u).squeeze(3) q_evals_2 = torch.gather(q_evals_2, dim=3, index=u).squeeze(3) episode_num = batch['o'].shape[0] q_targets_sample = [] actions_prob = [] actions_probs_nozero = [] actions_logprobs = [] # self.agent.init_hidden(batch.batch_size) # self.policy_hidden= self.policy.init_hidden().unsqueeze(0).expand(batch_size, self.n_agents, -1) # bav for transition_idx in range(max_episode_len): inputs, inputs_next = self._get_inputs( batch, transition_idx) # 给obs加last_action、agent_id if self.args.cuda: inputs = inputs.cuda() inputs_next = inputs_next.cuda() # agent_outs, self.target_policy_hidden = self.target_policy(inputs_next, self.target_policy_hidden) agent_outs, self.policy_hidden = self.agent.policy( inputs_next, self.policy_hidden) avail_actions_ = avail_u[:, transition_idx] reshaped_avail_actions = avail_actions_.reshape( episode_num * self.n_agents, -1) agent_outs[reshaped_avail_actions == 0] = -1e11 # agent_outs = gumbel_softmax(agent_outs, hard=True) # one-hot TODO ac_sample1 # agent_outs = agent_outs.view(episode_num, self.n_agents, -1) # (1,3,9) # action_next = agent_outs.max(dim=2, keepdim=True)[1] # (1,3,1) # # action_next = torch.nonzero(agent_outs).squeeze() # actions_next_sample.append(action_next) # 选择动作的序号 # agent_outs = gumbel_softmax(agent_outs, hard=True) # 概率 TODO ac_mean agent_outs = F.softmax(agent_outs / 1, dim=1) # 概率分布 agent_outs = agent_outs.view(episode_num, self.n_agents, -1) # 概率有0 actions_prob.append(agent_outs) # 每个动作的概率 # actions_prob_nozero=agent_outs.clone() # actions_prob_nozero[reshaped_avail_actions.view(episode_num, self.n_agents, -1) == 0] = 1e-11 # 概率没有0 # actions_probs_nozero.append(actions_prob_nozero) # Have to deal with situation of 0.0 probabilities because we can't do log 0 z = agent_outs == 0.0 z = z.float() * 1e-8 actions_logprobs.append(torch.log(agent_outs + z)) actions_prob = torch.stack(actions_prob, dim=1) # Concat over time log_prob_pi = torch.stack(actions_logprobs, dim=1) # Concat over time # actions_probs_nozero = torch.stack(actions_probs_nozero, dim=1) # Concat over time # actions_probs_nozero[mask == 0] = 1.0 # log_prob_pi = torch.log(actions_probs_nozero) # Updating alpha wrt entropy # alpha = 0.0 # trade-off between exploration (max entropy) and exploitation (max Q) target_entropy = -1. * self.n_actions if self.args.auto_entropy is True: # alpha_loss = -(self.log_alpha * (log_prob + target_entropy).detach()).mean() #target_entropy=-2 alpha_loss = (torch.sum(actions_prob.detach() * (-self.log_alpha * (log_prob_pi + target_entropy).detach()), dim=-1) * mask).sum() / mask.sum() # print('alpha loss: ',alpha_loss) self.alpha_optimizer.zero_grad() alpha_loss.backward() self.alpha_optimizer.step() self.alpha = self.log_alpha.exp() else: self.alpha = 1. 
alpha_loss = 0 # Calculated baseline q_targets_sample = torch.sum( actions_prob * (q_targets - self.alpha * log_prob_pi), dim=-1).view(episode_num, max_episode_len, -1).detach() # (1,60,3) TODO ac_mean q_targets_sample_2 = torch.sum( actions_prob * (q_targets_2 - self.alpha * log_prob_pi), dim=-1).view(episode_num, max_episode_len, -1).detach() # (1,60,3) # actions_next_sample = torch.stack(actions_next_sample, dim=1) # Concat over time # (1,60,3,1) TODO ac_sample1 # # q_targets_sample = q_targets[:,:,actions_next] # q_targets_sample = torch.gather(q_targets, dim=3, index=actions_next_sample).squeeze(3) # (1,60,3) q_total_eval = self.eval_qmix_net(q_evals, s) # [1, 60, 1] q_total_target = self.target_qmix_net(q_targets_sample, s_next) q_total_eval_2 = self.eval_qmix_net_2(q_evals_2, s) # [1, 60, 1] q_total_target_2 = self.target_qmix_net_2(q_targets_sample_2, s_next) q_total_target_min = torch.min(q_total_target, q_total_target_2) q_total_target = q_total_target_min targets = r + self.args.gamma * q_total_target * (1 - terminated) ### update q1 td_error = (q_total_eval - targets.detach()) masked_td_error = mask * td_error # 抹掉填充的经验的td_error # 不能直接用mean,因为还有许多经验是没用的,所以要求和再比真实的经验数,才是真正的均值 loss = (masked_td_error**2).sum() / mask.sum() self.optimizer.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm_(self.eval_parameters, self.args.grad_norm_clip) self.optimizer.step() ### update q2 td_error_2 = (q_total_eval_2 - targets.detach()) masked_td_error_2 = mask * td_error_2 # 抹掉填充的经验的td_error # 不能直接用mean,因为还有许多经验是没用的,所以要求和再比真实的经验数,才是真正的均值 loss_2 = (masked_td_error_2**2).sum() / mask.sum() self.optimizer_2.zero_grad() loss_2.backward() torch.nn.utils.clip_grad_norm_(self.eval_parameters_2, self.args.grad_norm_clip) self.optimizer_2.step() # if train_step > 0 and train_step % self.args.target_update_cycle == 0: # 200 # self.target_rnn.load_state_dict(self.eval_rnn.state_dict()) # self.target_qmix_net.load_state_dict(self.eval_qmix_net.state_dict()) # self.target_policy.load_state_dict(self.agent.policy.state_dict()) #TODO # Update the frozen target models if train_step % 10000 == 0: # how often to save the model args.save_cycle = 5000 self.save_model(train_step) # TODO def _get_inputs(self, batch, transition_idx): # 取出所有episode上该transition_idx的经验,u_onehot要取出所有,因为要用到上一条 obs, obs_next, u_onehot = batch['o'][:, transition_idx], \ batch['o_next'][:, transition_idx], batch['u_onehot'][:] episode_num = obs.shape[0] inputs, inputs_next = [], [] inputs.append(obs) inputs_next.append(obs_next) # 给obs添加上一个动作、agent编号 30+9+3 if self.args.last_action: if transition_idx == 0: # 如果是第一条经验,就让前一个动作为0向量 inputs.append(torch.zeros_like(u_onehot[:, transition_idx])) else: inputs.append(u_onehot[:, transition_idx - 1]) inputs_next.append(u_onehot[:, transition_idx]) if self.args.reuse_network: # 因为当前的obs三维的数据,每一维分别代表(episode编号,agent编号,obs维度),直接在dim_1上添加对应的向量 # 即可,比如给agent_0后面加(1, 0, 0, 0, 0),表示5个agent中的0号。而agent_0的数据正好在第0行,那么需要加的 # agent编号恰好就是一个单位矩阵,即对角线为1,其余为0 inputs.append( torch.eye(self.args.n_agents).unsqueeze(0).expand( episode_num, -1, -1)) inputs_next.append( torch.eye(self.args.n_agents).unsqueeze(0).expand( episode_num, -1, -1)) # 要把obs中的三个拼起来,并且要把episode_num个episode、self.args.n_agents个agent的数据拼成40条(40,96)的数据, # 因为这里所有agent共享一个神经网络,每条数据中带上了自己的编号,所以还是自己的数据 inputs = torch.cat( [x.reshape(episode_num * self.args.n_agents, -1) for x in inputs], dim=1) inputs_next = torch.cat([ x.reshape(episode_num * self.args.n_agents, -1) for x in inputs_next ], dim=1) return inputs, inputs_next # (3,42) def 
get_q_values(self, batch, max_episode_len):
        episode_num = batch['o'].shape[0]
        q_evals, q_targets = [], []
        for transition_idx in range(max_episode_len):
            inputs, inputs_next = self._get_inputs(batch, transition_idx)  # add last_action and agent_id to obs
            if self.args.cuda:
                inputs = inputs.cuda()
                inputs_next = inputs_next.cuda()
                self.eval_hidden = self.eval_hidden.cuda()
                self.target_hidden = self.target_hidden.cuda()
            q_eval, self.eval_hidden = self.eval_rnn(inputs, self.eval_hidden)  # inputs is (episode_num * n_agents, input_shape), q_eval is (episode_num * n_agents, n_actions)
            q_target, self.target_hidden = self.target_rnn(inputs_next, self.target_hidden)
            # reshape q_eval back to (episode_num, n_agents, n_actions)
            q_eval = q_eval.view(episode_num, self.n_agents, -1)
            q_target = q_target.view(episode_num, self.n_agents, -1)
            q_evals.append(q_eval)
            q_targets.append(q_target)
        # q_evals and q_targets are lists of max_episode_len tensors of shape (episode_num, n_agents, n_actions);
        # stack them into tensors of shape (episode_num, max_episode_len, n_agents, n_actions)
        q_evals = torch.stack(q_evals, dim=1)
        q_targets = torch.stack(q_targets, dim=1)
        return q_evals, q_targets

    def get_q_values_2(self, batch, max_episode_len):
        episode_num = batch['o'].shape[0]
        q_evals, q_targets = [], []
        for transition_idx in range(max_episode_len):
            inputs, inputs_next = self._get_inputs(batch, transition_idx)  # add last_action and agent_id to obs
            if self.args.cuda:
                inputs = inputs.cuda()
                inputs_next = inputs_next.cuda()
                self.eval_hidden_2 = self.eval_hidden_2.cuda()
                self.target_hidden_2 = self.target_hidden_2.cuda()
            q_eval, self.eval_hidden_2 = self.eval_rnn_2(inputs, self.eval_hidden_2)  # inputs is (episode_num * n_agents, input_shape), q_eval is (episode_num * n_agents, n_actions)
            q_target, self.target_hidden_2 = self.target_rnn_2(inputs_next, self.target_hidden_2)
            # reshape q_eval back to (episode_num, n_agents, n_actions)
            q_eval = q_eval.view(episode_num, self.n_agents, -1)
            q_target = q_target.view(episode_num, self.n_agents, -1)
            q_evals.append(q_eval)
            q_targets.append(q_target)
        # q_evals and q_targets are lists of max_episode_len tensors of shape (episode_num, n_agents, n_actions);
        # stack them into tensors of shape (episode_num, max_episode_len, n_agents, n_actions)
        q_evals = torch.stack(q_evals, dim=1)
        q_targets = torch.stack(q_targets, dim=1)
        return q_evals, q_targets

    def init_hidden(self, episode_num):
        # initialize an eval_hidden and a target_hidden for every agent in every episode
        self.eval_hidden = torch.zeros(
            (episode_num, self.n_agents, self.args.rnn_hidden_dim))
        self.target_hidden = torch.zeros(
            (episode_num, self.n_agents, self.args.rnn_hidden_dim))
        self.eval_hidden_2 = torch.zeros(
            (episode_num, self.n_agents, self.args.rnn_hidden_dim))
        self.target_hidden_2 = torch.zeros(
            (episode_num, self.n_agents, self.args.rnn_hidden_dim))
        self.policy_hidden = torch.zeros(
            (episode_num, self.n_agents, self.args.rnn_hidden_dim))
        # self.target_policy_hidden = torch.zeros((episode_num, self.n_agents, self.args.rnn_hidden_dim))  # TODO
        if self.args.cuda:
            self.eval_hidden = self.eval_hidden.cuda()
            self.target_hidden = self.target_hidden.cuda()
            self.eval_hidden_2 = self.eval_hidden_2.cuda()
            self.target_hidden_2 = self.target_hidden_2.cuda()
            self.policy_hidden = self.policy_hidden.cuda()
            # self.target_policy_hidden = self.target_policy_hidden.cuda()

    def save_model(self, train_step):
        num = str(train_step // self.args.save_cycle)
        if not os.path.exists(self.model_dir):
            os.makedirs(self.model_dir)
        torch.save(self.eval_qmix_net.state_dict(),
                   self.model_dir + '/' + num + '_qmix_net_params.pkl')
        torch.save(self.eval_rnn.state_dict(),
                   self.model_dir + '/' + num + '_rnn_net_params.pkl')
        torch.save(self.agent.policy.state_dict(),
                   self.model_dir + '/' + num + '_policy_net_params.pkl')

    def soft_update(self):
        self.tau = 0.005
        for param, target_param in zip(self.eval_rnn.parameters(),
                                       self.target_rnn.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
        for param, target_param in zip(self.eval_qmix_net.parameters(),
                                       self.target_qmix_net.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
        for param, target_param in zip(self.eval_rnn_2.parameters(),
                                       self.target_rnn_2.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
        for param, target_param in zip(self.eval_qmix_net_2.parameters(),
                                       self.target_qmix_net_2.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
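# ----------------------------------------------------------------------------
# Illustrative sketch (not from the class above): the twin-critic, SAC-style
# target that train_critic builds, written out on plain tensors. The helper
# name clipped_double_q_target and all of its arguments are invented for this
# example; the actual class produces the same quantity with its two target
# mixers and the current policy.
import torch


def clipped_double_q_target(r, done, gamma, alpha,
                            pi, log_pi, q1_target, q2_target,
                            mixer1, mixer2, s_next):
    """
    r, done:                (episode_num, max_episode_len, 1)
    pi, log_pi, q*_target:  (episode_num, max_episode_len, n_agents, n_actions)
    mixer1/mixer2:          callables mapping (per-agent values, state) -> (..., 1)
    """
    # per-agent soft value under the current policy: E_{a~pi}[ Q_i(a) - alpha * log pi(a) ]
    v1 = (pi * (q1_target - alpha * log_pi)).sum(dim=-1)
    v2 = (pi * (q2_target - alpha * log_pi)).sum(dim=-1)
    # mix per-agent values into Q_tot for each critic, then take the element-wise minimum
    q_tot_1 = mixer1(v1, s_next)
    q_tot_2 = mixer2(v2, s_next)
    q_tot_min = torch.min(q_tot_1, q_tot_2)
    # one-step soft Bellman target; the caller detaches it before the masked TD loss
    return r + gamma * q_tot_min * (1 - done)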
class Q_Decom: def __init__(self, args, itr): self.args = args input_shape = self.args.obs_shape # 调整RNN输入维度 if args.last_action: input_shape += self.args.n_actions if args.reuse_network: input_shape += self.args.n_agents # 设置网络 self.eval_rnn = RNN(input_shape, args) self.target_rnn = RNN(input_shape, args) # 通过数字标记,方便后续对算法类型进行判断 self.wqmix = 0 if self.args.alg == 'cwqmix' or self.args.alg == 'owqmix': self.wqmix = 1 # 默认值分解算法使用QMIX if 'qmix' in self.args.alg: self.eval_mix_net = QMIXMixer(args) self.target_mix_net = QMIXMixer(args) if self.wqmix > 0: self.qstar_eval_mix = QStar(args) self.qstar_target_mix = QStar(args) self.qstar_eval_rnn = RNN(input_shape, args) self.qstar_target_rnn = RNN(input_shape, args) if self.args.alg == 'cwqmix': self.alpha = 0.75 elif self.args.alg == 'owqmix': self.alpha = 0.5 else: raise Exception('没有这个算法') elif self.args.alg == 'vdn': self.eval_mix_net = VDNMixer() self.target_mix_net = VDNMixer() # 是否使用GPU if args.cuda: self.eval_rnn.cuda() self.target_rnn.cuda() self.eval_mix_net.cuda() self.target_mix_net.cuda() if self.wqmix > 0: self.qstar_eval_mix.cuda() self.qstar_target_mix.cuda() self.qstar_eval_rnn.cuda() self.qstar_target_rnn.cuda() # 是否加载模型 self.model_dir = args.model_dir + '/' + args.alg + '/' + args.map + '/' + str( itr) if args.load_model: if os.path.exists(self.model_dir + '/rnn_net_params.pkl'): path_rnn = self.model_dir + '/rnn_net_params.pkl' path_mix = self.model_dir + '/' + self.args.alg + '_net_params.pkl' map_location = 'cuda:0' if args.cuda else 'cpu' self.eval_rnn.load_state_dict( torch.load(path_rnn, map_location=map_location)) self.eval_mix_net.load_state_dict( torch.load(path_mix, map_location=map_location)) if self.wqmix > 0: path_agent_rnn = self.model_dir + '/rnn_net_params2.pkl' path_qstar = self.model_dir + '/' + 'qstar_net_params.pkl' self.qstar_eval_rnn.load_state_dict( torch.load(path_agent_rnn, map_location=map_location)) self.qstar_eval_mix.load_state_dict( torch.load(path_qstar, map_location=map_location)) print('成功加载模型 %s' % path_rnn + ' 和 %s' % path_mix) else: raise Exception("模型不存在") # 令target网络与eval网络参数相同 self.target_rnn.load_state_dict(self.eval_rnn.state_dict()) self.target_mix_net.load_state_dict(self.eval_mix_net.state_dict()) # 获取所有参数 self.eval_params = list(self.eval_rnn.parameters()) + list( self.eval_mix_net.parameters()) # 学习过程中要为每个episode的每个agent维护一个eval_hidden,执行过程中要为每个agetn维护一个eval_hidden self.eval_hidden = None self.target_hidden = None if self.wqmix > 0: # 令target网络与eval网络参数相同 self.qstar_target_rnn.load_state_dict( self.qstar_eval_rnn.state_dict()) self.qstar_target_mix.load_state_dict( self.qstar_eval_mix.state_dict()) # 获取所有参数 self.qstar_params = list(self.qstar_eval_rnn.parameters()) + list( self.qstar_eval_mix.parameters()) # init hidden self.qstar_eval_hidden = None self.qstar_target_hidden = None # 获取优化器 if args.optim == 'RMS': self.optimizer = torch.optim.RMSprop(self.eval_params, lr=args.lr) if self.wqmix > 0: self.qstar_optimizer = torch.optim.RMSprop(self.qstar_params, lr=args.lr) else: self.optimizer = torch.optim.Adam(self.eval_params) if self.wqmix > 0: self.qstar_optimizer = torch.optim.Adam(self.qstar_params) print("值分解算法 " + self.args.alg + " 初始化") def learn(self, batch, max_episode_len, train_step, epsilon=None): """ 在learn的时候,抽取到的数据是四维的,四个维度分别为 1——第几个episode 2——episode中第几个transition 3——第几个agent的数据 4——具体obs维度。 因为在选动作时不仅需要输入当前的inputs,还要给神经网络输入hidden_state, hidden_state和之前的经验相关,因此就不能随机抽取经验进行学习。所以这里一次抽取多个episode, 然后一次给神经网络传入每个episode的同一个位置的transition :param batch: :param max_episode_len: 
:param train_step: :param epsilon: :return: """ # 获得episode的数目 episode_num = batch['o'].shape[0] # 初始化隐藏状态 self.init_hidden(episode_num) # 数据转为tensor for key in batch.keys(): if key == 'a': batch[key] = torch.LongTensor(batch[key]) else: batch[key] = torch.Tensor(batch[key]) s, next_s, a, r, avail_a, next_avail_a, done = batch['s'], batch['next_s'], batch['a'], \ batch['r'], batch['avail_a'], batch['next_avail_a'], \ batch['done'] # 避免填充的产生 TD-error 影响训练 mask = 1 - batch["padded"].float() # 获取当前与下个状态的q值,(episode, max_episode_len, n_agents, n_actions) eval_qs, target_qs = self.get_q(batch, episode_num, max_episode_len) # 是否使用GPU if self.args.cuda: a = a.cuda() r = r.cuda() done = done.cuda() mask = mask.cuda() if 'qmix' in self.args.alg: s = s.cuda() next_s = next_s.cuda() # 得到每个动作对应的 q 值 eval_qs = torch.gather(eval_qs, dim=3, index=a).squeeze(3) # 计算Q_tot eval_q_total = self.eval_mix_net(eval_qs, s) qstar_q_total = None # 需要先把不行动作的mask掉 target_qs[next_avail_a == 0.0] = -9999999 if self.wqmix > 0: # TODO 找到使得Q_tot最大的联合动作,由于qmix是单调假设的,每个agent q值最大则 Q_tot最大,因此联合动作就是每个agent q值最大的动作 argmax_u = target_qs.argmax(dim=3).unsqueeze(3) qstar_eval_qs, qstar_target_qs = self.get_q( batch, episode_num, max_episode_len, True) # 获得对应的动作q值 qstar_eval_qs = torch.gather(qstar_eval_qs, dim=3, index=a).squeeze(3) qstar_target_qs = torch.gather(qstar_target_qs, dim=3, index=argmax_u).squeeze(3) # 通过前馈网络得到qstar qstar_q_total = self.qstar_eval_mix(qstar_eval_qs, s) next_q_total = self.qstar_target_mix(qstar_target_qs, next_s) else: # 得到 target q,是inf出现的nan # target_qs[next_avail_a == 0.0] = float('-inf') target_qs = target_qs.max(dim=3)[0] # 计算target Q_tot next_q_total = self.target_mix_net(target_qs, next_s) target_q_total = r + self.args.gamma * next_q_total * (1 - done) weights = torch.Tensor(np.ones(eval_q_total.shape)) if self.wqmix > 0: # 1- 可以保证weights在 (0, 1] # TODO: 这里只说是 (0, 1] 之间,也没说怎么设置还是学习,暂时认为是一个随机数 # weights = torch.Tensor(1 - np.random.ranf(eval_q_total.shape)) weights = torch.full(eval_q_total.shape, self.alpha) if self.args.alg == 'cwqmix': error = mask * (target_q_total - qstar_q_total) elif self.args.alg == 'owqmix': error = mask * (target_q_total - eval_q_total) else: raise Exception("模型不存在") weights[error > 0] = 1. 
# qstar 参数更新 qstar_error = mask * (qstar_q_total - target_q_total.detach()) qstar_loss = (qstar_error**2).sum() / mask.sum() self.qstar_optimizer.zero_grad() qstar_loss.backward() torch.nn.utils.clip_grad_norm_(self.qstar_params, self.args.clip_norm) self.qstar_optimizer.step() # 计算 TD error # TODO 这里权值detach有影响吗 td_error = mask * (eval_q_total - target_q_total.detach()) if self.args.cuda: weights = weights.cuda() loss = (weights.detach() * td_error**2).sum() / mask.sum() self.optimizer.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm_(self.eval_params, self.args.clip_norm) self.optimizer.step() if train_step > 0 and train_step % self.args.target_update_period == 0: self.target_rnn.load_state_dict(self.eval_rnn.state_dict()) self.target_mix_net.load_state_dict(self.eval_mix_net.state_dict()) if self.wqmix > 0: self.qstar_target_rnn.load_state_dict( self.qstar_eval_rnn.state_dict()) self.qstar_target_mix.load_state_dict( self.qstar_eval_mix.state_dict()) def init_hidden(self, episode_num): """ 为每个episode中的每个agent都初始化一个eval_hidden,target_hidden :param episode_num: :return: """ self.eval_hidden = torch.zeros( (episode_num, self.args.n_agents, self.args.rnn_hidden_dim)) self.target_hidden = torch.zeros( (episode_num, self.args.n_agents, self.args.rnn_hidden_dim)) if self.wqmix > 0: self.qstar_eval_hidden = torch.zeros( (episode_num, self.args.n_agents, self.args.rnn_hidden_dim)) self.qstar_target_hidden = torch.zeros( (episode_num, self.args.n_agents, self.args.rnn_hidden_dim)) def get_q(self, batch, episode_num, max_episode_len, wqmix=False): eval_qs, target_qs = [], [] for trans_idx in range(max_episode_len): # 每个obs加上agent编号和last_action inputs, next_inputs = self.get_inputs(batch, episode_num, trans_idx) # 是否使用GPU if self.args.cuda: inputs = inputs.cuda() next_inputs = next_inputs.cuda() if wqmix: self.qstar_eval_hidden = self.qstar_eval_hidden.cuda() self.qstar_target_hidden = self.qstar_target_hidden.cuda() else: self.eval_hidden = self.eval_hidden.cuda() self.target_hidden = self.target_hidden.cuda() # 得到q值 if wqmix: eval_q, self.qstar_eval_hidden = self.qstar_eval_rnn( inputs, self.qstar_eval_hidden) target_q, self.qstar_target_hidden = self.qstar_target_rnn( next_inputs, self.qstar_target_hidden) else: eval_q, self.eval_hidden = self.eval_rnn( inputs, self.eval_hidden) target_q, self.target_hidden = self.target_rnn( next_inputs, self.target_hidden) # 形状变换 eval_q = eval_q.view(episode_num, self.args.n_agents, -1) target_q = target_q.view(episode_num, self.args.n_agents, -1) # 添加这个transition 的信息 eval_qs.append(eval_q) target_qs.append(target_q) # 将max_episode_len个(episode, n_agents, n_actions) 堆叠为 (episode, max_episode_len, n_agents, n_actions) eval_qs = torch.stack(eval_qs, dim=1) target_qs = torch.stack(target_qs, dim=1) return eval_qs, target_qs def get_inputs(self, batch, episode_num, trans_idx): # 取出所有episode上该trans_idx的经验,onehot_a要取出所有,因为要用到上一条 obs, next_obs, onehot_a = batch['o'][:, trans_idx], \ batch['next_o'][:, trans_idx], batch['onehot_a'][:] inputs, next_inputs = [], [] inputs.append(obs) next_inputs.append(next_obs) # 给obs添加上一个动作,agent编号 if self.args.last_action: if trans_idx == 0: inputs.append(torch.zeros_like(onehot_a[:, trans_idx])) else: inputs.append(onehot_a[:, trans_idx - 1]) next_inputs.append(onehot_a[:, trans_idx]) if self.args.reuse_network: """ 给数据增加agent编号,对于每个episode的数据,分为多个agent,每个agent编号为独热编码, 这样对于所有agent的编号堆叠起来就是一个单位矩阵 """ inputs.append( torch.eye(self.args.n_agents).unsqueeze(0).expand( episode_num, -1, -1)) next_inputs.append( 
                torch.eye(self.args.n_agents).unsqueeze(0).expand(
                    episode_num, -1, -1))
        # concatenate everything appended above into tensors of shape
        # (episode_num * n_agents, obs_shape + n_actions + n_agents)
        inputs = torch.cat(
            [x.reshape(episode_num * self.args.n_agents, -1) for x in inputs],
            dim=1)
        next_inputs = torch.cat([
            x.reshape(episode_num * self.args.n_agents, -1) for x in next_inputs
        ], dim=1)
        return inputs, next_inputs

    def save_model(self, train_step):
        if not os.path.exists(self.model_dir):
            os.makedirs(self.model_dir)
        if type(train_step) == str:
            num = train_step
        else:
            num = str(train_step // self.args.save_model_period)
        torch.save(
            self.eval_mix_net.state_dict(),
            self.model_dir + '/' + num + '_' + self.args.alg + '_net_params.pkl')
        torch.save(self.eval_rnn.state_dict(),
                   self.model_dir + '/' + num + '_rnn_params.pkl')
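# ----------------------------------------------------------------------------
# Illustrative sketch (not part of Q_Decom): the weighting rule that the learn()
# method above applies for CW-QMIX / OW-QMIX, written on plain tensors. The
# function name and its arguments are invented for the example.
import torch


def weighted_qmix_loss(eval_q_total, qstar_q_total, target_q_total,
                       mask, alpha, alg='owqmix'):
    # start from the uniform weight alpha in (0, 1]
    weights = torch.full_like(eval_q_total, alpha)
    if alg == 'cwqmix':
        # centrally weighted: full weight where the unrestricted Q* under-estimates the target
        error = mask * (target_q_total - qstar_q_total)
    elif alg == 'owqmix':
        # optimistically weighted: full weight where the monotonic Q_tot under-estimates the target
        error = mask * (target_q_total - eval_q_total)
    else:
        raise ValueError('unknown algorithm: %s' % alg)
    weights[error > 0] = 1.
    td_error = mask * (eval_q_total - target_q_total.detach())
    # masked mean: divide by the number of real (non-padded) transitions
    return (weights.detach() * td_error ** 2).sum() / mask.sum()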
class DMAQ_qattenLearner: def __init__(self, args, itr): self.args = args input_shape = self.args.obs_shape # 调整RNN输入维度 if args.last_action: input_shape += self.args.n_actions if args.reuse_network: input_shape += self.args.n_agents # 设置网络 self.eval_rnn = RNN(input_shape, args) self.target_rnn = RNN(input_shape, args) # 默认值分解算法使用 dmaq if self.args.alg == 'dmaq_qatten': self.mixer = DMAQ_QattenMixer() self.target_mixer = DMAQ_QattenMixer() elif self.args.alg == 'dmaq': self.mixer = DMAQer(args) self.target_mixer = DMAQer(args) else: raise Exception('Unsupported!') # 是否使用GPU if args.cuda: self.eval_rnn.cuda() self.target_rnn.cuda() self.mixer.cuda() self.target_mixer.cuda() # 是否加载模型 # self.model_dir = args.model_dir + '/' + args.alg + '/' + args.map + '/' + str(itr) # self.model_dir = args.model_dir + '/' + args.map + '/' + args.alg + '_' + str(self.args.epsilon_anneal_steps // 10000) + 'w' + '/' + str(itr) if args.load_model: if os.path.exists(self.args.model_dir + '/rnn_net_params.pkl'): path_rnn = self.args.model_dir + '/rnn_net_params.pkl' path_mix = self.args.model_dir + '/' + self.args.alg + '_net_params.pkl' map_location = 'cuda:0' if args.cuda else 'cpu' self.eval_rnn.load_state_dict( torch.load(path_rnn, map_location=map_location)) self.mixer.load_state_dict( torch.load(path_mix, map_location=map_location)) print('成功加载模型 %s' % path_rnn + ' 和 %s' % path_mix) else: raise Exception("模型不存在") # 令target网络与eval网络参数相同 self.target_rnn.load_state_dict(self.eval_rnn.state_dict()) self.target_mixer.load_state_dict(self.mixer.state_dict()) # 获取所有参数 self.eval_params = list(self.eval_rnn.parameters()) + list( self.mixer.parameters()) # 学习过程中要为每个episode的每个agent维护一个eval_hidden,执行过程中要为每个agetn维护一个eval_hidden self.eval_hidden = None self.target_hidden = None # 获取优化器 if args.optim == 'RMS': self.optimizer = torch.optim.RMSprop(self.eval_params, lr=args.lr) else: self.optimizer = torch.optim.Adam(self.eval_params) print("值分解算法 " + self.args.alg + " 初始化") def learn(self, batch, max_episode_len, train_step): """ 在learn的时候,抽取到的数据是四维的,四个维度分别为 1——第几个episode 2——episode中第几个transition 3——第几个agent的数据 4——具体obs维度。 因为在选动作时不仅需要输入当前的inputs,还要给神经网络输入hidden_state, hidden_state和之前的经验相关,因此就不能随机抽取经验进行学习。所以这里一次抽取多个episode, 然后一次给神经网络传入每个episode的同一个位置的transition :param batch: :param max_episode_len: :param train_step: :param epsilon: :return: """ # 获得episode的数目 episode_num = batch['o'].shape[0] # 初始化隐藏状态 self.init_hidden(episode_num) # 数据转为tensor for key in batch.keys(): if key == 'a': batch[key] = torch.LongTensor(batch[key]) else: batch[key] = torch.Tensor(batch[key]) s, next_s, a, r, avail_a, next_avail_a, done, actions_onehot = batch['s'], batch['next_s'], batch['a'], \ batch['r'], batch['avail_a'], batch[ 'next_avail_a'], \ batch['done'], batch['onehot_a'] # 避免填充的产生 TD-error 影响训练 mask = 1 - batch["padded"].float() # 获取当前与下个状态的q值,(episode, max_episode_len, n_agents, n_actions) eval_qs, target_qs = self.get_q(batch, episode_num, max_episode_len) # 是否使用GPU if self.args.cuda: a = a.cuda() r = r.cuda() done = done.cuda() mask = mask.cuda() s = s.cuda() next_s = next_s.cuda() actions_onehot = actions_onehot.cuda() # 得到每个动作对应的 q 值 eval_qsa = torch.gather(eval_qs, dim=3, index=a).squeeze(3) max_action_qvals = eval_qs.max(dim=3)[0] target_qs[next_avail_a == 0.0] = -9999999 target_qsa = target_qs.max(dim=3)[0] # target_max_actions = target_qs.argmax(dim=3).unsqueeze(3) # 计算Q_tot q_attend_regs = None if self.args.alg == "dmaq_qatten": ans_chosen, q_attend_regs, head_entropies = self.mixer(eval_qsa, s, is_v=True) ans_adv, _, _ = 
self.mixer(eval_qsa, s, actions=actions_onehot, max_q_i=max_action_qvals, is_v=False) eval_qsa = ans_chosen + ans_adv else: ans_chosen = self.mixer(eval_qsa, s, is_v=True) ans_adv = self.mixer(eval_qsa, s, actions=actions_onehot, max_q_i=max_action_qvals, is_v=False) eval_qsa = ans_chosen + ans_adv # if self.args.double_q: # if self.args.self.mixer == "dmaq_qatten": # target_chosen, _, _ = self.target_mixer(target_chosen_qvals, batch["state"][:, 1:], is_v=True) # target_adv, _, _ = self.target_mixer(target_chosen_qvals, batch["state"][:, 1:], # actions=cur_max_actions_onehot, # max_q_i=target_max_qvals, is_v=False) # target_max_qvals = target_chosen + target_adv # else: # target_chosen = self.target_mixer(target_chosen_qvals, batch["state"][:, 1:], is_v=True) # target_adv = self.target_mixer(target_chosen_qvals, batch["state"][:, 1:], # actions=cur_max_actions_onehot, # max_q_i=target_max_qvals, is_v=False) # target_max_qvals = target_chosen + target_adv # else: target_max_qvals = self.target_mixer(target_qsa, next_s, is_v=True) # Calculate 1-step Q-Learning targets targets = r + self.args.gamma * (1 - done) * target_max_qvals # Td-error td_error = (eval_qsa - targets.detach()) mask = mask.expand_as(td_error) # 0-out the targets that came from padded data masked_td_error = td_error * mask # Normal L2 loss, take mean over actual data loss = (masked_td_error**2).sum() / mask.sum() if self.args.alg == "dmaq_qatten": loss += q_attend_regs # else: # loss = (masked_td_error ** 2).sum() / mask.sum() self.optimizer.zero_grad() loss.backward() torch.nn.utils.clip_grad_norm_(self.eval_params, self.args.clip_norm) self.optimizer.step() if train_step > 0 and train_step % self.args.target_update_period == 0: self.target_rnn.load_state_dict(self.eval_rnn.state_dict()) self.target_mixer.load_state_dict(self.mixer.state_dict()) def init_hidden(self, episode_num): """ 为每个episode中的每个agent都初始化一个eval_hidden,target_hidden :param episode_num: :return: """ self.eval_hidden = torch.zeros( (episode_num, self.args.n_agents, self.args.rnn_hidden_dim)) self.target_hidden = torch.zeros( (episode_num, self.args.n_agents, self.args.rnn_hidden_dim)) def get_q( self, batch, episode_num, max_episode_len, ): eval_qs, target_qs = [], [] for trans_idx in range(max_episode_len): # 每个obs加上agent编号和last_action inputs, next_inputs = self.get_inputs(batch, episode_num, trans_idx) # 是否使用GPU if self.args.cuda: inputs = inputs.cuda() next_inputs = next_inputs.cuda() self.eval_hidden = self.eval_hidden.cuda() self.target_hidden = self.target_hidden.cuda() # 得到q值 eval_q, self.eval_hidden = self.eval_rnn(inputs, self.eval_hidden) target_q, self.target_hidden = self.target_rnn( next_inputs, self.target_hidden) # 形状变换 eval_q = eval_q.view(episode_num, self.args.n_agents, -1) target_q = target_q.view(episode_num, self.args.n_agents, -1) # 添加这个transition 的信息 eval_qs.append(eval_q) target_qs.append(target_q) # 将max_episode_len个(episode, n_agents, n_actions) 堆叠为 (episode, max_episode_len, n_agents, n_actions) eval_qs = torch.stack(eval_qs, dim=1) target_qs = torch.stack(target_qs, dim=1) return eval_qs, target_qs def get_inputs(self, batch, episode_num, trans_idx): # 取出所有episode上该trans_idx的经验,onehot_a要取出所有,因为要用到上一条 obs, next_obs, onehot_a = batch['o'][:, trans_idx], \ batch['next_o'][:, trans_idx], batch['onehot_a'][:] inputs, next_inputs = [], [] inputs.append(obs) next_inputs.append(next_obs) # 给obs添加上一个动作,agent编号 if self.args.last_action: if trans_idx == 0: inputs.append(torch.zeros_like(onehot_a[:, trans_idx])) else: 
                inputs.append(onehot_a[:, trans_idx - 1])
            next_inputs.append(onehot_a[:, trans_idx])
        if self.args.reuse_network:
            """
            Append the agent index to the data: within each episode the data is split per agent
            and each agent's index is one-hot encoded, so stacking the indices of all agents
            gives an identity matrix.
            """
            inputs.append(
                torch.eye(self.args.n_agents).unsqueeze(0).expand(
                    episode_num, -1, -1))
            next_inputs.append(
                torch.eye(self.args.n_agents).unsqueeze(0).expand(
                    episode_num, -1, -1))
        # concatenate everything appended above into tensors of shape
        # (episode_num * n_agents, obs_shape + n_actions + n_agents)
        inputs = torch.cat(
            [x.reshape(episode_num * self.args.n_agents, -1) for x in inputs],
            dim=1)
        next_inputs = torch.cat([
            x.reshape(episode_num * self.args.n_agents, -1) for x in next_inputs
        ], dim=1)
        return inputs, next_inputs

    def save_model(self, train_step):
        if not os.path.exists(self.args.model_dir):
            os.makedirs(self.args.model_dir)
        if type(train_step) == str:
            num = train_step
        else:
            num = str(train_step // self.args.save_model_period)
        torch.save(
            self.mixer.state_dict(),
            self.args.model_dir + '/' + num + '_' + self.args.alg + '_net_params.pkl')
        torch.save(self.eval_rnn.state_dict(),
                   self.args.model_dir + '/' + num + '_rnn_params.pkl')
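# ----------------------------------------------------------------------------
# Illustrative sketch (separate from the learner above): how the duplex-dueling
# mixer is queried twice and the two parts are summed, following the
# ans_chosen + ans_adv pattern in DMAQ_qattenLearner.learn(). The mixer here is
# just a callable stub; the helper name dmaq_q_tot is invented, while the
# keyword arguments mirror the real call. For the qatten variant the mixer
# additionally returns an attention regulariser and head entropies.
def dmaq_q_tot(mixer, agent_qs_chosen, state, actions_onehot, max_action_qvals):
    # value head: state-dependent part, independent of which joint action was taken
    ans_chosen = mixer(agent_qs_chosen, state, is_v=True)
    # advantage head: correction that depends on the chosen joint action and on
    # each agent's greedy value max_a Q_i(a)
    ans_adv = mixer(agent_qs_chosen, state,
                    actions=actions_onehot, max_q_i=max_action_qvals, is_v=False)
    # Q_tot decomposes as value plus advantage
    return ans_chosen + ans_adv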
class QMIX_PG(): def __init__(self, agent, args): self.agent = agent self.target_policy = copy.deepcopy(self.agent.policy) # TODO self.policy_params = list(agent.policy.parameters()) self.policy_optimiser = torch.optim.RMSprop(params=self.policy_params, lr=args.actor_lr) # , alpha=args.optim_alpha, eps=args.optim_eps) self.n_actions = args.n_actions self.n_agents = args.n_agents self.state_shape = args.state_shape self.obs_shape = args.obs_shape input_shape = self.obs_shape # 根据参数决定RNN的输入维度 if args.last_action: input_shape += self.n_actions if args.reuse_network: input_shape += self.n_agents # 神经网络 self.eval_rnn = RNN(input_shape, args) # 每个agent选动作的网络 self.target_rnn = RNN(input_shape, args) self.eval_qmix_net = QMixNet(args) # 把agentsQ值加起来的网络 self.target_qmix_net = QMixNet(args) self.args = args if self.args.cuda: self.eval_rnn.cuda() self.target_rnn.cuda() self.eval_qmix_net.cuda() self.target_qmix_net.cuda() self.agent.policy.cuda() self.target_policy.cuda() tmp = f'clamp2-5_' + f'{args.loss_coeff_entropy}_' + f'{args.actor_buffer_size}_{args.critic_buffer_size}_{args.actor_train_steps}_{args.critic_train_steps}_' \ f'{args.actor_update_delay}_{args.critic_lr}' # f'{args.anneal_epsilon}_' self.model_dir = 'linear_mix/' + args.model_dir + '/qmix_ac_total_counterfactual' + '/' + tmp + '/' + args.map # 如果存在模型则加载模型 if self.args.load_model: if os.path.exists(self.model_dir + '/rnn_net_params.pkl'): path_rnn = self.model_dir + '/rnn_net_params.pkl' path_qmix = self.model_dir + '/qmix_net_params.pkl' path_policy = self.model_dir + '/policy_net_params.pkl' map_location = 'cuda:0' if self.args.cuda else 'cpu' self.eval_rnn.load_state_dict(torch.load(path_rnn, map_location=map_location)) self.eval_qmix_net.load_state_dict(torch.load(path_qmix, map_location=map_location)) self.target_policy.load_state_dict(torch.load(path_policy, map_location=map_location)) # TODO print('Successfully load the model: {} and {}'.format(path_rnn, path_qmix)) else: raise Exception("No model!") # 让target_net和eval_net的网络参数相同 self.target_rnn.load_state_dict(self.eval_rnn.state_dict()) self.target_qmix_net.load_state_dict(self.eval_qmix_net.state_dict()) self.target_policy.load_state_dict(self.agent.policy.state_dict()) # TODO self.eval_parameters = list(self.eval_qmix_net.parameters()) + list(self.eval_rnn.parameters()) if args.optimizer == "RMS": self.optimizer = torch.optim.RMSprop(self.eval_parameters, lr=args.critic_lr) # 执行过程中,要为每个agent都维护一个eval_hidden # 学习过程中,要为每个episode的每个agent都维护一个eval_hidden、target_hidden self.eval_hidden = None self.target_hidden = None print('Init alg QMIX') def train_actor(self, batch, max_episode_len, update_agent_id=0): # EpisodeBatch # Get the relevant quantities # bs = batch.batch_size # max_t = batch.max_seq_length # actions = batch["u"][:, :-1] # terminated = batch["terminated"][:, :-1].float() # avail_actions = batch["avail_u"][:, :-1] # # mask = batch["filled"][:, :-1].float() # mask = 1 - batch["padded"].float() # mask[:, 1:] = mask[:, 1:] * (1 - terminated[:, :-1]) # mask = mask.repeat(1, 1, self.n_agents).view(-1) # states = batch["s"][:, :-1] episode_num = batch['o'].shape[0] self.init_hidden(episode_num) for key in batch.keys(): # 把batch里的数据转化成tensor if key == 'u': batch[key] = torch.tensor(batch[key], dtype=torch.long) else: batch[key] = torch.tensor(batch[key], dtype=torch.float32) # s, s_next, u, r, avail_u, avail_u_next, terminated = batch['s'], batch['s_next'], batch['u'], \ # batch['r'], batch['avail_u'], batch['avail_u_next'], \ # batch['terminated'] s = batch['s'] 
terminated = batch['terminated'] mask = 1 - batch["padded"].float() # 用来把那些填充的经验的TD-error置0,从而不让它们影响到学习 mask[:, 1:] = mask[:, 1:] * (1 - terminated[:, :-1]) mask = mask.repeat(1, 1, self.n_agents).view(-1) # actions = batch["u"][:, :-1] actions = batch["u"] avail_u = batch["avail_u"] # terminated = batch["terminated"][:, :-1].float() # avail_actions = batch["avail_u"][:, :-1] if self.args.cuda: s = s.cuda() # u = u.cuda() # r = r.cuda() # s_next = s_next.cuda() # terminated = terminated.cuda() actions = actions.cuda() avail_u = avail_u.cuda() mask = mask.cuda() # # build q # inputs = self.critic._build_inputs(batch, bs, max_t) # q_vals = self.critic.forward(inputs).detach()[:, :-1] # episode_num = batch['o'].shape[0] q_evals, q_targets = [], [] actions_prob = [] # self.agent.init_hidden(batch.batch_size) # self.policy_hidden= self.policy.init_hidden().unsqueeze(0).expand(batch_size, self.n_agents, -1) # bav for transition_idx in range(max_episode_len): inputs, inputs_next = self._get_inputs(batch, transition_idx) # 给obs加last_action、agent_id if self.args.cuda: inputs = inputs.cuda() inputs_next = inputs_next.cuda() self.eval_hidden = self.eval_hidden.cuda() q_eval, self.eval_hidden = self.eval_rnn(inputs, self.eval_hidden) # inputs维度为(40,96),得到的q_eval维度为(40,n_actions) Q_i # 把q_eval维度重新变回(8, 5,n_actions) q_eval = q_eval.view(episode_num, self.n_agents, -1) q_evals.append(q_eval) agent_outs, self.policy_hidden = self.agent.policy(inputs, self.policy_hidden) avail_actions_ = avail_u[:, transition_idx] reshaped_avail_actions = avail_actions_.reshape(episode_num * self.n_agents, -1) agent_outs[reshaped_avail_actions == 0] = -1e11 # agent_outs = torch.nn.functional.softmax(agent_outs, dim=-1) # agent_outs = gumbel_softmax(agent_outs, hard=False) #todo agent_outs = F.softmax(agent_outs / 1, dim=1) # 概率分布 # If hard=True, then the returned sample will be one-hot, otherwise it will # be a probabilitiy distribution that sums to 1 across classes agent_outs = agent_outs.view(episode_num, self.n_agents, -1) actions_prob.append(agent_outs) # 每个动作的概率 # 得的q_eval和q_target是一个列表,列表里装着max_episode_len个数组,数组的的维度是(episode个数, n_agents,n_actions) # 把该列表转化成(episode个数, max_episode_len, n_agents,n_actions)的数组 q_vals = torch.stack(q_evals, dim=1) # mac_out = [] # self.mac.init_hidden(batch.batch_size) # for t in range(batch.max_seq_length - 1): # agent_outs = self.mac.forward(batch, t=t) # mac_out.append(agent_outs) actions_prob = torch.stack(actions_prob, dim=1) # Concat over time # Mask out unavailable actions, renormalise (as in action selection) # actions_prob[avail_actions == 0] = 0 # actions_prob = actions_prob / actions_prob.sum(dim=-1, keepdim=True) # actions_prob[avail_actions == 0] = 0 # Calculated baseline q_taken_actions = torch.gather(q_vals, dim=3, index=actions).squeeze(3) # (1,60,3) TODO NOTE q_taken = self.eval_qmix_net(q_taken_actions, s).detach() # TODO # (1,60,1) Q_taken = q_taken.repeat(repeats=(1, 1, self.n_agents)) # (1,60,3) pi = actions_prob.view(-1, self.n_actions) # q_targets_mean = torch.sum(actions_prob * q_vals, dim=-1).view(-1).detach() # (180) q_mean_actions = torch.sum(actions_prob * q_vals, dim=-1).detach() # (1,60,3) q_i_mean_negi_taken = [] # torch.stack( [torch.cat((q_targets_mean[:, :, i].unsqueeze(2), torch.cat((q_taken[:, :, :i], q_taken[:, :, i + 1:]), dim=2)), # dim=2) for i in range(3)],dim=1) # 顺序不对 q_taken_actions_detached = q_taken_actions.detach() for i in range(self.n_agents): q_temp = copy.deepcopy(q_taken_actions_detached) q_temp[:, :, i] = q_mean_actions[:, :, i] # 
q_i(mean_action) q_-i(tacken_action), q_i_mean_negi_taken.append(q_temp) q_i_mean_negi_taken = torch.stack(q_i_mean_negi_taken, dim=2) # [1, 60, 3, 3] q_i_mean_negi_taken = q_i_mean_negi_taken.view(episode_num, -1, self.n_agents) # [1, 60*3, 3] # TODO # s_repeat = s.repeat(repeats=(1, self.n_agents, 1)) # (1,60,48)-> # (1,60*3,48) #TODO 顺序错误 s_repeat = s.repeat(repeats=(1, 1, self.n_agents)) # (1,60,48)-> # (1,60,48*3) s_repeat = s_repeat.view(episode_num, self.n_agents, -1) # (1,60,48*3)-> # (1,60*3,48) Q_i_mean_negi_taken = self.eval_qmix_net(q_i_mean_negi_taken, s_repeat).detach() # TODO # (1,60*3,1) # q_total_target = q_total_target.repeat(repeats=(1, 1, self.n_agents)) # (1,60,3) Q_i_mean_negi_taken = Q_i_mean_negi_taken.view(episode_num, -1, self.n_agents) # (1,60*3,1)->(1,60,3) # Calculate policy grad with mask pi_taken = torch.gather(pi, dim=1, index=actions.reshape(-1, 1)).squeeze(1) pi_taken[mask == 0] = 1.0 log_pi_taken = torch.log(pi_taken) # # TODO normalnize # advantages_original=(q_taken.view(-1,self.n_agents)-baseline.view(-1,self.n_agents)) # advantages = (advantages_original - advantages_original.mean(dim=0)) / advantages_original.std(dim=0) # # # advantages = (advantages_original-advantages_original.mean(dim=1).repeat(repeats=(1, max_episode_len, 1)))/advantages_original.std(dim=1).repeat(repeats=(1, max_episode_len, 1)) # advantages=advantages.view(-1).detach() # TODO no normalnize advantages = (Q_taken.view(-1) - Q_i_mean_negi_taken.view(-1)).detach() # advantages = (Q_taken.view(-1) ).detach()#TODO if self.args.advantage_norm: # TODO EPS = 1e-10 advantages = (advantages - advantages.mean()) / (advantages.std() + EPS) # advantages = (q_taken.view(-1) - baseline.view(-1)).detach() # advantages = (-log_pi_taken + q_taken.view(-1).detach() - baseline) # TODO # import numpy as np # update_agent_id = np.random.randint(0, self.n_agents) # TODO # update_agent_id=update_agent_id+1 if update_agent_id<self.n_agents-1 else 0 # policy_loss = - ((advantages[update_agent_id::self.n_agents] * log_pi_taken[update_agent_id::self.n_agents]) * mask[update_agent_id::self.n_agents]).sum() / mask[update_agent_id::self.n_agents].sum() policy_loss = - ((advantages * log_pi_taken) * mask).sum() / mask.sum() # policy_entropy = torch.mean(torch.exp(log_pi_taken) * log_pi_taken) # regu_policy_loss = policy_loss + self.args.loss_coeff_entropy * policy_entropy # Optimise agents self.policy_optimiser.zero_grad() policy_loss.backward() # policy gradient grad_norm = torch.nn.utils.clip_grad_norm_(self.policy_params, self.args.grad_norm_clip) self.policy_optimiser.step() self.soft_update() if policy_loss.item() > 1e3: print("policy_loss:", policy_loss.item()) def train_critic(self, batch, max_episode_len, train_step): # train_step表示是第几次学习,用来控制更新target_net网络的参数 ''' 在learn的时候,抽取到的数据是四维的,四个维度分别为 1——第几个episode 2——episode中第几个transition 3——第几个agent的数据 4——具体obs维度。因为在选动作时不仅需要输入当前的inputs,还要给神经网络输入hidden_state, hidden_state和之前的经验相关,因此就不能随机抽取经验进行学习。所以这里一次抽取多个episode,然后一次给神经网络 传入每个episode的同一个位置的transition ''' episode_num = batch['o'].shape[0] self.init_hidden(episode_num) for key in batch.keys(): # 把batch里的数据转化成tensor if key == 'u': batch[key] = torch.tensor(batch[key], dtype=torch.long) else: batch[key] = torch.tensor(batch[key], dtype=torch.float32) s, s_next, u, r, avail_u, avail_u_next, terminated = batch['s'], batch['s_next'], batch['u'], \ batch['r'], batch['avail_u'], batch['avail_u_next'], \ batch['terminated'] mask = 1 - batch["padded"].float() # 用来把那些填充的经验的TD-error置0,从而不让它们影响到学习 r = (r - r.mean()) / 
(r.std() + 1e-6) # todo # 得到每个agent对应的Q值,维度为(episode个数, max_episode_len, n_agents, n_actions) q_evals, q_targets = self.get_q_values(batch, max_episode_len) if self.args.cuda: s = s.cuda() u = u.cuda() r = r.cuda() s_next = s_next.cuda() terminated = terminated.cuda() mask = mask.cuda() # 取每个agent动作对应的Q值,并且把最后不需要的一维去掉,因为最后一维只有一个值了 q_evals = torch.gather(q_evals, dim=3, index=u).squeeze(3) episode_num = batch['o'].shape[0] q_targets_sample = [] actions_prob = [] actions_next_sample = [] # self.agent.init_hidden(batch.batch_size) # self.policy_hidden= self.policy.init_hidden().unsqueeze(0).expand(batch_size, self.n_agents, -1) # bav for transition_idx in range(max_episode_len): inputs, inputs_next = self._get_inputs(batch, transition_idx) # 给obs加last_action、agent_id if self.args.cuda: inputs = inputs.cuda() inputs_next = inputs_next.cuda() # agent_outs, self.target_policy_hidden = self.target_policy(inputs_next, self.target_policy_hidden) TODO agent_outs, self.policy_hidden = self.agent.policy(inputs_next, self.policy_hidden) avail_actions_ = avail_u[:, transition_idx] reshaped_avail_actions = avail_actions_.reshape(episode_num * self.n_agents, -1) agent_outs[reshaped_avail_actions == 0] = -1e11 # agent_outs = gumbel_softmax(agent_outs, hard=True) # one-hot TODO ac_sample1 # agent_outs = agent_outs.view(episode_num, self.n_agents, -1) # (1,3,9) # action_next = agent_outs.max(dim=2, keepdim=True)[1] # (1,3,1) # # action_next = torch.nonzero(agent_outs).squeeze() # actions_next_sample.append(action_next) # 选择动作的序号 # agent_outs = gumbel_softmax(agent_outs, hard=True) # 概率 TODO ac_mean agent_outs = F.softmax(agent_outs / 1, dim=1) # 概率分布 agent_outs = agent_outs.view(episode_num, self.n_agents, -1) actions_prob.append(agent_outs) # 每个动作的概率 actions_prob = torch.stack(actions_prob, dim=1) # Concat over time # pi = actions_prob.view(-1, self.n_actions) # Calculated baseline baseline = torch.sum(actions_prob * q_targets, dim=-1).view(episode_num, max_episode_len, -1).detach() # (1,60,3) q_targets_sample = baseline # actions_next_sample = torch.stack(actions_next_sample, dim=1) # Concat over time # (1,60,3,1) TODO # # q_targets_sample = q_targets[:,:,actions_next] # q_targets_sample = torch.gather(q_targets, dim=3, index=actions_next_sample).squeeze(3) # (1,60,3) # # 得到target_q # q_targets[avail_u_next == 0.0] = - 9999999 # q_targets = q_targets.max(dim=3)[0] q_total_eval = self.eval_qmix_net(q_evals, s) # [1, 60, 1] q_total_target = self.target_qmix_net(q_targets_sample, s_next) targets = r + self.args.gamma * q_total_target * (1 - terminated) td_error = (q_total_eval - targets.detach()) masked_td_error = mask * td_error # 抹掉填充的经验的td_error # 不能直接用mean,因为还有许多经验是没用的,所以要求和再比真实的经验数,才是真正的均值 critic_loss = (masked_td_error ** 2).sum() / mask.sum() self.optimizer.zero_grad() critic_loss.backward() torch.nn.utils.clip_grad_norm_(self.eval_parameters, self.args.grad_norm_clip) self.optimizer.step() if critic_loss.item() > 1e3: print("critic_loss:", critic_loss.item()) # if train_step > 0 and train_step % self.args.target_update_cycle == 0: # 200 # self.target_rnn.load_state_dict(self.eval_rnn.state_dict()) # self.target_qmix_net.load_state_dict(self.eval_qmix_net.state_dict()) # self.target_policy.load_state_dict(self.agent.policy.state_dict()) #TODO # Update the frozen target models if train_step % self.args.save_cycle == 0: # how often to save the model args.save_cycle = 5000 self.save_model(train_step) # TODO def _get_inputs(self, batch, transition_idx): # 
取出所有episode上该transition_idx的经验,u_onehot要取出所有,因为要用到上一条 obs, obs_next, u_onehot = batch['o'][:, transition_idx], \ batch['o_next'][:, transition_idx], batch['u_onehot'][:] episode_num = obs.shape[0] inputs, inputs_next = [], [] inputs.append(obs) inputs_next.append(obs_next) # 给obs添加上一个动作、agent编号 30+9+3 if self.args.last_action: if transition_idx == 0: # 如果是第一条经验,就让前一个动作为0向量 inputs.append(torch.zeros_like(u_onehot[:, transition_idx])) else: inputs.append(u_onehot[:, transition_idx - 1]) inputs_next.append(u_onehot[:, transition_idx]) if self.args.reuse_network: # 因为当前的obs三维的数据,每一维分别代表(episode编号,agent编号,obs维度),直接在dim_1上添加对应的向量 # 即可,比如给agent_0后面加(1, 0, 0, 0, 0),表示5个agent中的0号。而agent_0的数据正好在第0行,那么需要加的 # agent编号恰好就是一个单位矩阵,即对角线为1,其余为0 inputs.append(torch.eye(self.args.n_agents).unsqueeze(0).expand(episode_num, -1, -1)) inputs_next.append(torch.eye(self.args.n_agents).unsqueeze(0).expand(episode_num, -1, -1)) # 要把obs中的三个拼起来,并且要把episode_num个episode、self.args.n_agents个agent的数据拼成40条(40,96)的数据, # 因为这里所有agent共享一个神经网络,每条数据中带上了自己的编号,所以还是自己的数据 inputs = torch.cat([x.reshape(episode_num * self.args.n_agents, -1) for x in inputs], dim=1) inputs_next = torch.cat([x.reshape(episode_num * self.args.n_agents, -1) for x in inputs_next], dim=1) return inputs, inputs_next # (3,42) def get_q_values(self, batch, max_episode_len): episode_num = batch['o'].shape[0] q_evals, q_targets = [], [] for transition_idx in range(max_episode_len): inputs, inputs_next = self._get_inputs(batch, transition_idx) # 给obs加last_action、agent_id if self.args.cuda: inputs = inputs.cuda() inputs_next = inputs_next.cuda() self.eval_hidden = self.eval_hidden.cuda() self.target_hidden = self.target_hidden.cuda() q_eval, self.eval_hidden = self.eval_rnn(inputs, self.eval_hidden) # inputs维度为(40,96),得到的q_eval维度为(40,n_actions) q_target, self.target_hidden = self.target_rnn(inputs_next, self.target_hidden) # 把q_eval维度重新变回(8, 5,n_actions) q_eval = q_eval.view(episode_num, self.n_agents, -1) q_target = q_target.view(episode_num, self.n_agents, -1) q_evals.append(q_eval) q_targets.append(q_target) # 得的q_eval和q_target是一个列表,列表里装着max_episode_len个数组,数组的的维度是(episode个数, n_agents,n_actions) # 把该列表转化成(episode个数, max_episode_len, n_agents,n_actions)的数组 q_evals = torch.stack(q_evals, dim=1) q_targets = torch.stack(q_targets, dim=1) return q_evals, q_targets def init_hidden(self, episode_num): # 为每个episode中的每个agent都初始化一个eval_hidden、target_hidden self.eval_hidden = torch.zeros((episode_num, self.n_agents, self.args.rnn_hidden_dim)) self.target_hidden = torch.zeros((episode_num, self.n_agents, self.args.rnn_hidden_dim)) self.policy_hidden = torch.zeros((episode_num, self.n_agents, self.args.rnn_hidden_dim)) self.target_policy_hidden = torch.zeros((episode_num, self.n_agents, self.args.rnn_hidden_dim)) # TODO if self.args.cuda: self.eval_hidden = self.eval_hidden.cuda() self.target_hidden = self.target_hidden.cuda() self.policy_hidden = self.policy_hidden.cuda() self.target_policy_hidden = self.target_policy_hidden.cuda() def save_model(self, train_step): num = str(train_step // self.args.save_cycle) if not os.path.exists(self.model_dir): os.makedirs(self.model_dir) torch.save(self.eval_qmix_net.state_dict(), self.model_dir + '/' + num + '_qmix_net_params.pkl') torch.save(self.eval_rnn.state_dict(), self.model_dir + '/' + num + '_rnn_net_params.pkl') torch.save(self.agent.policy.state_dict(), self.model_dir + '/' + num + '_policy_net_params.pkl') def soft_update(self): self.tau = 0.005 for param, target_param in zip(self.eval_rnn.parameters(), 
                                       self.target_rnn.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
        for param, target_param in zip(self.eval_qmix_net.parameters(),
                                       self.target_qmix_net.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
        for param, target_param in zip(self.agent.policy.parameters(),
                                       self.target_policy.parameters()):
            target_param.data.copy_(self.tau * param.data + (1 - self.tau) * target_param.data)
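# ----------------------------------------------------------------------------
# Illustrative sketch (separate from the classes above): the mixed counterfactual
# baseline used by train_actor. For agent i, its chosen-action Q is swapped for
# its expected Q under the current policy while the other agents keep their
# chosen-action values, and the mixer is re-evaluated to obtain the baseline.
# The helper name and its arguments are invented; the mixer is a callable stub
# and the shapes follow the comments in the code above.
import torch


def counterfactual_advantages(mixer, q_vals, actions, actions_prob, s):
    """
    q_vals:       (episode_num, T, n_agents, n_actions)  individual Q_i
    actions:      (episode_num, T, n_agents, 1)          chosen actions
    actions_prob: (episode_num, T, n_agents, n_actions)  current policy pi_i
    s:            (episode_num, T, state_dim)            global state
    """
    episode_num, T, n_agents, _ = q_vals.shape
    q_taken = torch.gather(q_vals, dim=3, index=actions).squeeze(3)   # (E, T, N)
    q_mean = torch.sum(actions_prob * q_vals, dim=-1)                 # (E, T, N), E_pi[Q_i]
    q_tot_taken = mixer(q_taken, s)                                   # (E, T, 1)

    # build one counterfactual per-agent Q vector for each agent i
    counterfactuals = []
    for i in range(n_agents):
        q_cf = q_taken.detach().clone()
        q_cf[:, :, i] = q_mean[:, :, i]   # agent i: expected Q; others: chosen-action Q
        counterfactuals.append(q_cf)
    q_cf = torch.stack(counterfactuals, dim=2)                        # (E, T, N, N)
    q_cf = q_cf.view(episode_num, T * n_agents, n_agents)

    # repeat the state so that row (t, i) of q_cf is mixed with state s_t
    s_rep = s.repeat(1, 1, n_agents).view(episode_num, T * n_agents, -1)
    q_tot_cf = mixer(q_cf, s_rep).view(episode_num, T, n_agents)      # (E, T, N)

    # advantage of the sampled joint action from each agent's point of view
    return (q_tot_taken.expand(-1, -1, n_agents) - q_tot_cf).detach()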