def _init_agents(self):
    # parameter sharing: one embedding and two attention blocks (spatial and
    # temporal) are shared, together with their targets, across all agents
    self.embedding = Embedding_Layer(self.input_dim,
                                     self.hidden_dim).to(self.device)
    self.attention = Attention_Model(self.hidden_dim).to(self.device)
    self.temporal_attention = Attention_Model(self.hidden_dim).to(self.device)
    self.embedding_target = Embedding_Layer(
        self.input_dim, self.hidden_dim).to(self.device)
    self.attention_target = Attention_Model(self.hidden_dim).to(self.device)
    self.temporal_attention_target = Attention_Model(self.hidden_dim).to(
        self.device)
    Dueling_DDQN_Learner.copy_network(self.embedding, self.embedding_target)
    Dueling_DDQN_Learner.copy_network(self.attention, self.attention_target)
    Dueling_DDQN_Learner.copy_network(self.temporal_attention,
                                      self.temporal_attention_target)
    for i in range(self.num_agents):
        q_network = Double_Attention_Model(self.input_dim, self.output_dim,
                                           self.hidden_dim).to(self.device)
        q_network_target = Double_Attention_Model(
            self.input_dim, self.output_dim, self.hidden_dim).to(self.device)
        # inject the shared layers so every agent's Q-network (and its
        # target) reuses the same embedding/attention modules
        q_network.set_layer_para(self.embedding, self.attention,
                                 self.temporal_attention)
        q_network_target.set_layer_para(self.embedding_target,
                                        self.attention_target,
                                        self.temporal_attention_target)
        self.agents[i].set_q_network(q_network, q_network_target)
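# A minimal, self-contained sketch of the layer-sharing pattern used above,
# with toy stand-ins rather than the project's classes: two Q-networks hold
# references to the *same* embedding module, so both agents' losses
# accumulate gradients into one set of shared weights.
import torch
import torch.nn as nn

class _ToyQNet(nn.Module):
    def __init__(self, shared_embedding, hidden_dim, output_dim):
        super().__init__()
        self.embedding = shared_embedding  # shared reference, not a copy
        self.head = nn.Linear(hidden_dim, output_dim)  # per-agent head

    def forward(self, x):
        return self.head(torch.relu(self.embedding(x)))

shared = nn.Linear(8, 16)
q1, q2 = _ToyQNet(shared, 16, 4), _ToyQNet(shared, 16, 4)
assert q1.embedding is q2.embedding  # one module, two owners
loss = q1(torch.randn(2, 8)).sum() + q2(torch.randn(2, 8)).sum()
loss.backward()  # shared.weight.grad now holds both agents' gradients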
def __init__(self, input_dim, output_dim, hidden_dim):
    super().__init__()
    self.input_dim = input_dim
    self.output_dim = output_dim
    self.hidden_dim = hidden_dim
    self.embedding = Embedding_Layer(self.input_dim, self.hidden_dim)
    self.rnn = RNN_Model(self.hidden_dim, self.hidden_dim)
    self.attention = Attention_Model(self.hidden_dim)
    self.relu = nn.ReLU()
    self.linear_out = nn.Linear(self.hidden_dim, self.output_dim)
    self.hidden, self.cell = None, None
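# RNN_Model itself is not shown in this listing. A plausible minimal
# implementation, consistent with its call sites (called with or without an
# explicit hidden/cell pair, and exposing get_hidden_state()), might look
# like this LSTM wrapper -- an assumption for illustration, not the
# project's actual code:
import torch
import torch.nn as nn

class RNN_Model_Sketch(nn.Module):
    def __init__(self, input_dim, hidden_dim):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.state = None  # running (hidden, cell), carried between calls

    def forward(self, x, hidden=None, cell=None):
        # x: [batch, seq, input_dim]; reuse the running state unless an
        # explicit (hidden, cell) pair is supplied
        state = self.state if hidden is None else (hidden, cell)
        out, self.state = self.lstm(x, state)
        return out, self.state

    def get_hidden_state(self):
        hidden, cell = self.state
        return hidden, cell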
class LSTM_Attention_Agents(Attention_Agents):
    def __init__(self, config, num_agents, input_dim, hidden_dim, output_dim,
                 neighbor_map, node_name):
        super().__init__(config, num_agents, input_dim, hidden_dim,
                         output_dim, neighbor_map, node_name)
        # Basic_Agents.__init__ already dispatches to the overridden
        # _init_agents(), so it must not be called again here: a second call
        # would register every agent and its parameters twice.
        self.hidden = np.zeros((self.num_agents, self.hidden_dim))
        self.hidden_target = np.zeros((self.num_agents, self.hidden_dim))
        self.cell = np.zeros((self.num_agents, self.hidden_dim))
        self.cell_target = np.zeros((self.num_agents, self.hidden_dim))

    def _init_agents(self):
        # parameter sharing: embedding, RNN and attention are shared by all agents
        self.embedding = Embedding_Layer(self.input_dim,
                                         self.hidden_dim).to(self.device)
        self.rnn = RNN_Model(self.hidden_dim, self.num_agents).to(self.device)
        self.attention = Attention_Model(self.hidden_dim).to(self.device)
        self.embedding_target = Embedding_Layer(
            self.input_dim, self.hidden_dim).to(self.device)
        self.rnn_target = RNN_Model(self.hidden_dim,
                                    self.num_agents).to(self.device)
        self.attention_target = Attention_Model(self.hidden_dim).to(
            self.device)
        Dueling_DDQN_Learner.copy_network(self.embedding,
                                          self.embedding_target)
        Dueling_DDQN_Learner.copy_network(self.rnn, self.rnn_target)
        Dueling_DDQN_Learner.copy_network(self.attention,
                                          self.attention_target)
        # materialise the parameter lists: a bare itertools.chain is a
        # one-shot iterator that the optimizer would exhaust, leaving later
        # clip_grad/_scale_shared_grads calls to iterate over nothing
        self.share_para = list(
            chain(self.embedding.parameters(), self.attention.parameters(),
                  self.rnn.parameters()))
        self.all_para = list(self.share_para)
        # init the per-agent learners and one shared optimizer
        for i in range(self.num_agents):
            self.agents.append(Dueling_DDQN_Learner(self.config))
            self.all_para += list(self.agents[i].get_q_network().parameters())
        self.share_optimizer = optim.RMSprop(self.all_para,
                                             lr=self.lr,
                                             weight_decay=1e-4)

    def _get_embedding(self, state):
        state_embedding = self.embedding(state)
        batch_size = state.shape[0]
        if batch_size == 1:
            # rollout step: snapshot the recurrent state so it can be stored
            # with the transition, then let the RNN carry its own state
            self.hidden, self.cell = self.rnn.get_hidden_state()
            self.hidden_target, self.cell_target = \
                self.rnn_target.get_hidden_state()
            state_hidden, _ = self.rnn(state_embedding)
        else:
            # learning step: replay the recurrent state sampled from the buffer
            state_hidden, _ = self.rnn(state_embedding, self.hidden,
                                       self.cell)
        state_attention = self.attention(state_hidden, self.adj)
        return state_attention

    def _get_embedding_target(self, state):
        state_embedding_target = self.embedding_target(state)
        batch_size = state.shape[0]
        if batch_size == 1:
            state_hidden_target, _ = self.rnn_target(state_embedding_target)
        else:
            state_hidden_target, _ = self.rnn_target(state_embedding_target,
                                                     self.hidden_target,
                                                     self.cell_target)
        state_attention_target = self.attention_target(state_hidden_target,
                                                       self.adj)
        return state_attention_target

    def _update_sharing_target_network(self):
        Dueling_DDQN_Learner.soft_update_of_target_network(
            self.embedding, self.embedding_target, self.tau)
        Dueling_DDQN_Learner.soft_update_of_target_network(
            self.rnn, self.rnn_target, self.tau)
        Dueling_DDQN_Learner.soft_update_of_target_network(
            self.attention, self.attention_target, self.tau)

    def store_experience(self, states, actions, rewards, next_states,
                         is_dones):
        # pack the four recurrent states along axis 1 so they travel with the
        # transition through the replay buffer
        hidden = np.stack(
            (self.hidden, self.hidden_target, self.cell, self.cell_target),
            axis=1)
        self.buffer.store_experience(states, actions, rewards, next_states,
                                     is_dones, hidden)

    def sample_experience(self):
        states, actions, rewards, next_states, is_dones, hidden = \
            self.buffer.sample_experience()
        # unpack the recurrent states in the order they were stored
        self.hidden = hidden[:, 0]
        self.hidden_target = hidden[:, 1]
        self.cell = hidden[:, 2]
        self.cell_target = hidden[:, 3]
        return states, actions, rewards, next_states, is_dones

    def get_share_para(self):
        # merge the named parameters of all shared modules into one dict
        return {
            **dict(self.embedding.named_parameters()),
            **dict(self.rnn.named_parameters()),
            **dict(self.attention.named_parameters()),
        }
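# The four recurrent states above are packed along axis 1 before being stored
# with each transition, and unpacked in the same order after sampling. A
# minimal numpy round-trip of that layout (shapes are illustrative):
import numpy as np

num_agents, hidden_dim = 4, 32
h = np.zeros((num_agents, hidden_dim))
h_t = np.ones((num_agents, hidden_dim))
c = 2 * np.ones((num_agents, hidden_dim))
c_t = 3 * np.ones((num_agents, hidden_dim))

packed = np.stack((h, h_t, c, c_t), axis=1)  # [num_agents, 4, hidden_dim]
assert packed.shape == (num_agents, 4, hidden_dim)
# after sampling, index axis 1 in the order used by sample_experience()
h2, h_t2, c2, c_t2 = packed[:, 0], packed[:, 1], packed[:, 2], packed[:, 3]
assert np.array_equal(h_t2, h_t) and np.array_equal(c2, c)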
class Basic_Agents:
    def __init__(self, config, num_agents, input_dim, hidden_dim, output_dim):
        self.num_agents = num_agents
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.config = config
        # replay buffer parameters
        self.batch_size = config['batch_size']
        self.buffer_size = config['buffer_size']
        self.buffer = Replay_Buffer(self.buffer_size, self.batch_size)
        self.lr = config['lr']
        self.tau = config['tau']
        self.agents = []
        self.update_step = config['update_step']
        self.curr_step = 0
        self._init_agents()

    def _init_agents(self):
        self.embedding = Embedding_Layer(self.input_dim,
                                         self.hidden_dim).to(self.device)
        self.embedding_target = Embedding_Layer(
            self.input_dim, self.hidden_dim).to(self.device)
        Dueling_DDQN_Learner.copy_network(self.embedding,
                                          self.embedding_target)
        # materialise the parameter lists: plain generators would be exhausted
        # once the optimizer consumes them, making the later gradient clipping
        # and _scale_shared_grads calls silent no-ops
        self.share_para = list(self.embedding.parameters())
        self.all_para = list(self.share_para)
        # init the per-agent learners and one shared optimizer
        for i in range(self.num_agents):
            self.agents.append(Dueling_DDQN_Learner(self.config))
            self.all_para += list(self.agents[i].get_q_network().parameters())
        self.share_optimizer = optim.RMSprop(self.all_para,
                                             lr=self.lr,
                                             weight_decay=1e-4)

    def get_agent(self, i):
        return self.agents[i]

    def step(self, state, test=False):
        state_embedding = self._get_embedding(state)
        action = []
        for i in range(self.num_agents):
            action.append(self.agents[i].step(state_embedding[:, i], test))
        action = np.asarray(action)
        self.curr_step += 1
        return action

    def learn(self):
        # one gradient step per iteration; the loop index is unused and must
        # not shadow the agent index below
        for _ in range(self.update_step):
            states, actions, rewards, next_states, is_dones = \
                self.sample_experience()
            actions = torch.from_numpy(actions).long().to(self.device)
            rewards = torch.from_numpy(rewards).float().to(self.device)
            is_dones = torch.from_numpy(is_dones).float().to(self.device)
            states_embedding = self._get_embedding(states)
            next_states_embedding = self._get_embedding(next_states)
            next_states_embedding_target = self._get_embedding_target(
                next_states)
            total_loss = 0
            for i in range(self.num_agents):
                actions_values_current = \
                    self.agents[i].cal_current_actions_value(
                        next_states_embedding[:, i],
                        next_states_embedding_target[:, i], rewards[:, i],
                        is_dones)
                actions_values_expected = \
                    self.agents[i].cal_expected_actions_value(
                        states_embedding[:, i], actions[:, i])
                loss = F.mse_loss(actions_values_expected,
                                  actions_values_current)
                total_loss += loss
            # backpropagate the summed loss through the shared optimizer
            self.share_optimizer.zero_grad()
            total_loss.backward()
            torch.nn.utils.clip_grad_value_(self.all_para, 1)
            self.share_optimizer.step()
            # soft-update every agent's target network
            for i in range(self.num_agents):
                Dueling_DDQN_Learner.soft_update_of_target_network(
                    self.agents[i].q_network_current,
                    self.agents[i].q_network_target, self.tau)
            self._update_sharing_target_network()

    def get_share_para(self):
        return dict(self.embedding.named_parameters())

    def store_experience(self, states, actions, rewards, next_states,
                         is_dones):
        self.buffer.store_experience(states, actions, rewards, next_states,
                                     is_dones)

    def sample_experience(self):
        states, actions, rewards, next_states, is_dones = \
            self.buffer.sample_experience()
        return states, actions, rewards, next_states, is_dones

    def _get_embedding(self, state):
        return self.embedding(state)

    def _get_embedding_target(self, state):
        return self.embedding_target(state)

    def _update_sharing_target_network(self):
        Dueling_DDQN_Learner.soft_update_of_target_network(
            self.embedding, self.embedding_target, self.tau)

    def get_attention_score(self, i):
        # stub; attention-based subclasses return real scores
        return -1

    def _scale_shared_grads(self):
        """Scale gradients of shared parameters, since they accumulate
        gradients from every agent's loss."""
        for p in self.share_para:
            p.grad.data.mul_(1. / self.num_agents)

    def save_model(self, path):
        share_model_name = path + '/share_model.pkl'
        torch.save(self.embedding.state_dict(), share_model_name)
        for i in range(self.num_agents):
            unique_model_name = path + '/q_network_%d.pkl' % i
            torch.save(self.agents[i].q_network_current.state_dict(),
                       unique_model_name)

    def load_model(self, path):
        share_model_name = path + '/share_model.pkl'
        self.embedding.load_state_dict(
            torch.load(share_model_name, map_location=self.device))
        for i in range(self.num_agents):
            unique_model_name = path + '/q_network_%d.pkl' % i
            self.agents[i].q_network_current.load_state_dict(
                torch.load(unique_model_name, map_location=self.device))
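# Dueling_DDQN_Learner.copy_network and soft_update_of_target_network are
# used throughout but defined elsewhere. Standard implementations of these
# two target-network helpers look like the following -- a sketch of the usual
# pattern, not necessarily the project's exact code:
import torch

def copy_network(source, target):
    # hard copy: target <- source
    target.load_state_dict(source.state_dict())

def soft_update_of_target_network(source, target, tau):
    # Polyak averaging: theta_target <- tau * theta + (1 - tau) * theta_target
    with torch.no_grad():
        for param, target_param in zip(source.parameters(),
                                       target.parameters()):
            target_param.data.copy_(tau * param.data +
                                    (1.0 - tau) * target_param.data)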
class Attention_Model(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim):
        super().__init__()
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.pre_train = False
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.embedding = Embedding_Layer(self.input_dim, self.hidden_dim)
        # NOTE: this shared attention block has the same name as this
        # Q-network class; in the original sources they presumably come from
        # different modules
        self.attention = Attention_Model(self.hidden_dim)
        self.relu = nn.ReLU()
        self.linear1 = nn.Linear(self.hidden_dim, self.output_dim)
        self.attention_score = None

    def forward(self, state):
        """
        input:  state [batch, lane_num + 1, feature_dim, neighbors_num]
        output: [batch, feature_out]
        """
        # the 0: slice keeps the agent itself among its neighbors
        agent_state, neighbors_state = state[:, :, :, 0], state[:, :, :, 0:]
        neighbors_state = np.transpose(neighbors_state, (0, 3, 1, 2))
        agent_embedding = self.embedding(agent_state)
        # the neighbor-attention path is currently disabled: only the agent's
        # own embedding feeds the output head
        out = self.relu(agent_embedding)
        out = self.linear1(out)
        return out

    def change_mode(self):
        # leave pre-training: freeze the embedding, train the attention block
        self.pre_train = False
        for para in self.embedding.parameters():
            para.requires_grad = False
        for para in self.attention.parameters():
            para.requires_grad = True

    def set_layer_para(self, embedding_layer=None, attention_layer=None):
        if embedding_layer is not None:
            self.embedding = embedding_layer
        if attention_layer is not None:
            self.attention = attention_layer

    def get_attention_score(self):
        att = self.attention_score.cpu().detach().numpy()
        idx = np.nonzero(att)
        att = att[idx]  # keep only the nonzero scores
        return att
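# Shape walkthrough for the slicing in Attention_Model.forward, following its
# docstring (toy sizes; the feature layout is inferred from the code above):
import numpy as np

batch, lanes_plus_1, feat, neighbors = 2, 5, 3, 4
state = np.zeros((batch, lanes_plus_1, feat, neighbors))
agent_state = state[:, :, :, 0]  # the agent's own features
neighbors_state = np.transpose(state[:, :, :, 0:], (0, 3, 1, 2))
print(agent_state.shape)      # (2, 5, 3)
print(neighbors_state.shape)  # (2, 4, 5, 3) -> one row per neighbor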
class Double_Attention_Agents(Basic_Agents):
    def __init__(self, config, num_agents, input_dim, hidden_dim, output_dim,
                 seq_len, neighbor_map, node_name):
        super().__init__(config, num_agents, input_dim, hidden_dim,
                         output_dim)
        self.adj = self._get_adj(neighbor_map, node_name)

    def _init_agents(self):
        # parameter sharing
        self.n_heads = self.config['n_heads']
        self.embedding = Embedding_Layer(self.input_dim,
                                         self.hidden_dim).to(self.device)
        self.attention = Double_Attention_Model(self.hidden_dim,
                                                self.n_heads).to(self.device)
        self.embedding_target = Embedding_Layer(
            self.input_dim, self.hidden_dim).to(self.device)
        self.attention_target = Double_Attention_Model(
            self.hidden_dim, self.n_heads).to(self.device)
        Dueling_DDQN_Learner.copy_network(self.embedding,
                                          self.embedding_target)
        Dueling_DDQN_Learner.copy_network(self.attention,
                                          self.attention_target)
        # materialised lists, not one-shot chain iterators
        self.share_para = list(
            chain(self.embedding.parameters(), self.attention.parameters()))
        self.all_para = list(self.share_para)
        # init the per-agent learners and one shared optimizer
        for i in range(self.num_agents):
            self.agents.append(Dueling_DDQN_Learner(self.config))
            self.all_para += list(self.agents[i].get_q_network().parameters())
        self.share_optimizer = optim.RMSprop(self.all_para,
                                             lr=self.lr,
                                             weight_decay=1e-4)

    def _get_embedding(self, state):
        if len(state.shape) == 4:
            # add the leading batch dimension for a single rollout step
            state = np.expand_dims(state, axis=0)
        state_embedding = self.embedding(state)
        state_attention = self.attention(state_embedding, self.adj)
        return state_attention

    def _get_embedding_target(self, state):
        state_embedding_target = self.embedding_target(state)
        state_attention_target = self.attention_target(state_embedding_target,
                                                       self.adj)
        return state_attention_target

    def _update_sharing_target_network(self):
        Dueling_DDQN_Learner.soft_update_of_target_network(
            self.embedding, self.embedding_target, self.tau)
        Dueling_DDQN_Learner.soft_update_of_target_network(
            self.attention, self.attention_target, self.tau)

    def store_experience(self, states, actions, rewards, next_states,
                         is_dones):
        states = np.expand_dims(states, axis=0)
        next_states = np.expand_dims(next_states, axis=0)
        self.buffer.store_experience(states, actions, rewards, next_states,
                                     is_dones)

    def _get_adj(self, neighbor_map, node_name):
        # boolean adjacency: every node is its own neighbor, plus the
        # entries listed in neighbor_map
        adj = np.zeros((self.num_agents, self.num_agents), dtype=bool)
        for i, node in enumerate(node_name):
            adj[i][i] = True
            for neighbor in neighbor_map[node]:
                idx = node_name.index(neighbor)
                adj[i][idx] = True
        return adj

    def get_share_para(self):
        return {
            **dict(self.embedding.named_parameters()),
            **dict(self.attention.named_parameters()),
        }

    def get_attention_score(self, i):
        return self.attention.get_attention_score(i)
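# A concrete example of the boolean adjacency built by _get_adj: each node is
# connected to itself and to its neighbors from neighbor_map (toy 3-node map,
# names chosen to match the 'nt...' convention hinted at in the code):
import numpy as np

node_name = ['nt1', 'nt2', 'nt3']
neighbor_map = {'nt1': ['nt2'], 'nt2': ['nt1', 'nt3'], 'nt3': ['nt2']}

adj = np.zeros((3, 3), dtype=bool)
for i, node in enumerate(node_name):
    adj[i][i] = True
    for neighbor in neighbor_map[node]:
        adj[i][node_name.index(neighbor)] = True
# adj:
# [[ True  True False]
#  [ True  True  True]
#  [False  True  True]]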
class Attention_Agents(Basic_Agents):
    def __init__(self, config, num_agents, input_dim, hidden_dim, output_dim,
                 neighbor_map, node_name):
        super().__init__(config, num_agents, input_dim, hidden_dim,
                         output_dim)
        # Basic_Agents.__init__ already dispatches to the overridden
        # _init_agents(), so it must not be called again here: a second call
        # would register every agent and its parameters twice.
        self.adj = self._get_adj(neighbor_map, node_name)

    def _init_agents(self):
        # parameter sharing
        self.n_heads = self.config['n_heads']
        self.embedding = Embedding_Layer(self.input_dim,
                                         self.hidden_dim).to(self.device)
        self.attention = Attention_Model(self.hidden_dim,
                                         self.n_heads).to(self.device)
        self.embedding_target = Embedding_Layer(
            self.input_dim, self.hidden_dim).to(self.device)
        self.attention_target = Attention_Model(self.hidden_dim,
                                                self.n_heads).to(self.device)
        Dueling_DDQN_Learner.copy_network(self.embedding,
                                          self.embedding_target)
        Dueling_DDQN_Learner.copy_network(self.attention,
                                          self.attention_target)
        # materialised lists, not one-shot chain iterators
        self.share_para = list(
            chain(self.embedding.parameters(), self.attention.parameters()))
        self.all_para = list(self.share_para)
        # init the per-agent learners and one shared optimizer
        for i in range(self.num_agents):
            self.agents.append(Dueling_DDQN_Learner(self.config))
            self.all_para += list(self.agents[i].get_q_network().parameters())
        self.share_optimizer = optim.RMSprop(self.all_para,
                                             lr=self.lr,
                                             weight_decay=1e-4)

    def _get_embedding(self, state):
        state_embedding = self.embedding(state)
        state_attention = self.attention(state_embedding, self.adj)
        return state_attention

    def _get_embedding_target(self, state):
        state_embedding_target = self.embedding_target(state)
        state_attention_target = self.attention_target(state_embedding_target,
                                                       self.adj)
        return state_attention_target

    def _update_sharing_target_network(self):
        Dueling_DDQN_Learner.soft_update_of_target_network(
            self.embedding, self.embedding_target, self.tau)
        Dueling_DDQN_Learner.soft_update_of_target_network(
            self.attention, self.attention_target, self.tau)

    def _get_adj(self, neighbor_map, node_name):
        # boolean adjacency: every node is its own neighbor, plus the
        # entries listed in neighbor_map
        adj = np.zeros((self.num_agents, self.num_agents), dtype=bool)
        for i, node in enumerate(node_name):
            adj[i][i] = True
            for neighbor in neighbor_map[node]:
                idx = node_name.index(neighbor)
                adj[i][idx] = True
        return adj

    def get_attention_score(self, i):
        att = self.attention.get_attention_score(i, self.adj)
        return att

    def get_share_para(self):
        return {
            **dict(self.embedding.named_parameters()),
            **dict(self.attention.named_parameters()),
        }
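# Hypothetical end-to-end usage of Attention_Agents. The config keys are the
# ones read in Basic_Agents.__init__ and _init_agents (Dueling_DDQN_Learner
# likely requires more); `env`, the dimensions, and the maps are placeholders,
# so this is a sketch of the intended call sequence, not runnable as-is.
config = {'batch_size': 32, 'buffer_size': 10000, 'lr': 1e-3,
          'tau': 0.01, 'update_step': 5, 'n_heads': 4}
agents = Attention_Agents(config, num_agents=3, input_dim=12, hidden_dim=64,
                          output_dim=4, neighbor_map=neighbor_map,
                          node_name=node_name)
state = env.reset()                        # placeholder environment
for t in range(1000):
    action = agents.step(state)            # one action per agent
    next_state, reward, done = env.step(action)
    agents.store_experience(state, action, reward, next_state, done)
    state = next_state
    if t > 0 and t % config['update_step'] == 0:
        agents.learn()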