Пример #1
	def __init__(self, args):
		"""Build the actor, the eval/target critic pair and their optimizers.

		Attributes read from ``args``: n_actions, n_agents, state_shape,
		obs_shape, last_action, reuse_network, alg, model_dir, load_model,
		optimizer, lr_critic and lr_actor.

		Raises:
			Exception: if ``args.alg`` is neither 'coma' nor 'coma+commnet', or
				if ``args.load_model`` is set and no actor checkpoint exists.
		"""
		self.args = args
		self.n_actions = args.n_actions
		self.n_agents = args.n_agents
		self.state_shape = args.state_shape
		self.obs_shape = args.obs_shape

		# The actor sees the observation, optionally extended with the previous
		# action (one-hot) and the agent id (one-hot).
		actor_input_shape = self.obs_shape
		if args.last_action:
			actor_input_shape += self.n_actions
		if args.reuse_network:
			actor_input_shape += self.n_agents
		critic_input_shape = self._get_critic_input_shape()

		if args.alg == 'coma':
			# Each agent's net outputs scores for its own actions; the softmax
			# is recomputed whenever probabilities are needed.
			self.eval_rnn = RNN(actor_input_shape, args)
			print("COMA alg initialized")
		elif args.alg == 'coma+commnet':
			self.eval_rnn = CommNet(actor_input_shape, args)
			print("COMA+COMMNET initialized")
		else:
			# Only unknown algorithm names are rejected.
			raise Exception("No such algorithm!")

		# The critic yields the joint Q-values used, together with the actor
		# probabilities, to compute the advantage.
		self.eval_critic = ComaCritic(critic_input_shape, self.args)
		self.target_critic = ComaCritic(critic_input_shape, self.args)

		self.model_dir = args.model_dir + '/' + args.alg

		if self.args.load_model:
			path_rnn = self.model_dir + '/rnn_params.pkl'
			path_coma = self.model_dir + '/critic_params.pkl'
			if not os.path.exists(path_rnn):
				raise Exception("No such model!")
			# map_location='cpu' lets checkpoints saved on another device load;
			# load_state_dict copies the values into the existing parameters.
			self.eval_rnn.load_state_dict(torch.load(path_rnn, map_location='cpu'))
			self.eval_critic.load_state_dict(torch.load(path_coma, map_location='cpu'))
			print('Successfully load the model: {} and {}'.format(path_rnn, path_coma))

		# Start the target critic from the same parameters as the eval critic.
		self.target_critic.load_state_dict(self.eval_critic.state_dict())

		self.rnn_parameters = list(self.eval_rnn.parameters())
		self.critic_parameters = list(self.eval_critic.parameters())

		# NOTE(review): no optimizer is created for any other value of
		# args.optimizer, so training would fail later - confirm that is intended.
		if args.optimizer == "RMS":
			self.critic_optimizer = torch.optim.RMSprop(self.critic_parameters, lr=args.lr_critic)
			self.rnn_optimizer = torch.optim.RMSprop(self.rnn_parameters, lr=args.lr_actor)

		# Recurrent state of the actor; allocated per batch by init_hidden().
		self.eval_hidden = None
Пример #2
    def create_critic_network(self, name):
        """Create the critic's input placeholders and build its graph.

        Args:
            name: passed through to ``CommNet.critic_build_network``.

        Returns:
            Tuple ``(inputs, action, out)``: the two float32 placeholders and
            whatever ``CommNet.critic_build_network`` returns for them.
        """
        # NOTE(review): both placeholder calls were left unclosed; if they used
        # to carry a name= argument it is not recoverable from here - confirm.
        inputs = tf.placeholder(tf.float32,
                                shape=(None, NUM_AGENTS, VECTOR_OBS_LEN))
        action = tf.placeholder(tf.float32,
                                shape=(None, NUM_AGENTS, OUTPUT_LEN))

        out = CommNet.critic_build_network(name, inputs, action)
        return inputs, action, out
Пример #3
class COMA:
	"""COMA learner: a recurrent actor trained against a centralised critic.

	The actor (``eval_rnn``) maps per-agent inputs to action scores; the critic
	(``eval_critic`` / ``target_critic``) scores every action of every agent
	from the state, the observations and the actions of all agents.
	"""

	def __init__(self, args):
		"""Build the actor, the eval/target critic pair and their optimizers.

		Attributes read from ``args``: n_actions, n_agents, state_shape,
		obs_shape, last_action, reuse_network, alg, model_dir, load_model,
		optimizer, lr_critic and lr_actor.

		Raises:
			Exception: if ``args.alg`` is neither 'coma' nor 'coma+commnet', or
				if ``args.load_model`` is set and no actor checkpoint exists.
		"""
		self.args = args
		self.n_actions = args.n_actions
		self.n_agents = args.n_agents
		self.state_shape = args.state_shape
		self.obs_shape = args.obs_shape

		# The actor sees the observation, optionally extended with the previous
		# action (one-hot) and the agent id (one-hot).
		actor_input_shape = self.obs_shape
		if args.last_action:
			actor_input_shape += self.n_actions
		if args.reuse_network:
			actor_input_shape += self.n_agents
		critic_input_shape = self._get_critic_input_shape()

		if args.alg == 'coma':
			# Each agent's net outputs scores for its own actions; the softmax
			# is recomputed whenever probabilities are needed.
			self.eval_rnn = RNN(actor_input_shape, args)
			print("COMA alg initialized")
		elif args.alg == 'coma+commnet':
			self.eval_rnn = CommNet(actor_input_shape, args)
			print("COMA+COMMNET initialized")
		else:
			# Only unknown algorithm names are rejected.
			raise Exception("No such algorithm!")

		# The critic yields the joint Q-values used, together with the actor
		# probabilities, to compute the advantage.
		self.eval_critic = ComaCritic(critic_input_shape, self.args)
		self.target_critic = ComaCritic(critic_input_shape, self.args)

		self.model_dir = args.model_dir + '/' + args.alg

		if self.args.load_model:
			path_rnn = self.model_dir + '/rnn_params.pkl'
			path_coma = self.model_dir + '/critic_params.pkl'
			if not os.path.exists(path_rnn):
				raise Exception("No such model!")
			# map_location='cpu' lets checkpoints saved on another device load;
			# load_state_dict copies the values into the existing parameters.
			self.eval_rnn.load_state_dict(torch.load(path_rnn, map_location='cpu'))
			self.eval_critic.load_state_dict(torch.load(path_coma, map_location='cpu'))
			print('Successfully load the model: {} and {}'.format(path_rnn, path_coma))

		# Start the target critic from the same parameters as the eval critic.
		self.target_critic.load_state_dict(self.eval_critic.state_dict())

		self.rnn_parameters = list(self.eval_rnn.parameters())
		self.critic_parameters = list(self.eval_critic.parameters())

		# NOTE(review): no optimizer is created for any other value of
		# args.optimizer, so learn() would fail later - confirm that is intended.
		if args.optimizer == "RMS":
			self.critic_optimizer = torch.optim.RMSprop(self.critic_parameters, lr=args.lr_critic)
			self.rnn_optimizer = torch.optim.RMSprop(self.rnn_parameters, lr=args.lr_actor)

		# Recurrent state of the actor; allocated per batch by init_hidden().
		self.eval_hidden = None

	def _get_critic_input_shape(self):
		"""Return the width of one critic input row.

		The row is the concatenation built by ``_get_critic_inputs``: state,
		observation, agent id, and the previous plus current one-hot actions of
		all agents.
		"""
		# state
		input_shape = self.state_shape
		# obs
		input_shape += self.obs_shape
		# agent_id
		input_shape += self.n_agents
		# current and previous (*2) actions of all the agents
		input_shape += self.n_actions * self.n_agents * 2

		return input_shape

	def learn(self, batch, max_episode_len, train_step, epsilon):
		"""Run one critic update followed by one actor update on ``batch``.

		Args:
			batch: dict of array-likes, converted in place to tensors
				('actions' becomes long so it can index ``gather``, every other
				entry becomes float32).
			max_episode_len: number of transitions processed per episode.
			train_step: update counter; forwarded to the critic update, where
				it controls when the target critic is refreshed.
			epsilon: exploration rate mixed into the action probabilities.
		"""
		episode_num = batch['obs'].shape[0]  # number of episodes in the batch
		self.init_hidden(episode_num)  # fresh recurrent state for this batch

		# convert data in batch to tensors
		for key in batch.keys():
			if key == 'actions':
				batch[key] = torch.tensor(batch[key], dtype=torch.long)
			else:
				batch[key] = torch.tensor(batch[key], dtype=torch.float32)

		# COMA does not use a replay buffer, so next actions are not needed here.
		actions = batch['actions']

		# Zeroes the contribution of padded transitions so they do not affect learning.
		mask = (1 - batch["padded"].float()).repeat(1, 1, self.n_agents)

		# Train the critic and get the Q-value of every action of every agent,
		# then get the probability of every action to compute the advantage.
		q_values = self._train_critic(batch, max_episode_len, train_step)
		action_prob = self._get_action_prob(batch, max_episode_len, epsilon)

		q_taken = torch.gather(q_values, dim=3, index=actions).squeeze(3)
		pi_taken = torch.gather(action_prob, dim=3, index=actions).squeeze(3)  # prob of the action each agent took
		# Padded transitions have probability 0; set them to 1 so the log stays finite.
		pi_taken[mask == 0] = 1.0
		log_pi_taken = torch.log(pi_taken)

		# Baseline: the expected Q-value under the current policy.
		baseline = (q_values * action_prob).sum(dim=3, keepdim=True).squeeze(3).detach()
		advantage = (q_taken - baseline).detach()
		loss = -((advantage * log_pi_taken) * mask).sum() / mask.sum()

		self.rnn_optimizer.zero_grad()
		loss.backward()
		torch.nn.utils.clip_grad_norm_(self.rnn_parameters, self.args.grad_norm_clip)
		self.rnn_optimizer.step()

	def _get_critic_inputs(self, batch, transition_idx, max_episode_len):
		"""Build the critic inputs for one transition and for its successor.

		Returns:
			Tuple ``(inputs, inputs_next)`` of 2-D tensors with
			``episode_num * n_agents`` rows; columns are, in order: state, obs,
			previous actions of all agents, current actions of the other
			agents, agent id.
		"""
		# experiences of this transition_idx over all episodes
		obs, obs_next = batch['obs'][:, transition_idx], batch['obs_next'][:, transition_idx]
		state, state_next = batch['state'][:, transition_idx], batch['state_next'][:, transition_idx]

		actions_onehot = batch['actions_onehot'][:, transition_idx]
		if transition_idx != max_episode_len - 1:
			actions_onehot_next = batch['actions_onehot'][:, transition_idx + 1]
		else:
			# no action follows the last transition
			actions_onehot_next = torch.zeros(*actions_onehot.shape)

		# All agents share the state, so it has no n_agents dimension; add one.
		state = state.unsqueeze(1).expand(-1, self.n_agents, -1)
		state_next = state_next.unsqueeze(1).expand(-1, self.n_agents, -1)
		episode_num = obs.shape[0]

		# The critic is centralised: every agent's row carries the actions of all agents.
		actions_onehot = actions_onehot.view((episode_num, 1, -1)).repeat(1, self.n_agents, 1)
		actions_onehot_next = actions_onehot_next.view((episode_num, 1, -1)).repeat(1, self.n_agents, 1)

		# For the first transition the previous action is a zero vector.
		if transition_idx == 0:
			actions_onehot_last = torch.zeros_like(actions_onehot)
		else:
			actions_onehot_last = batch['actions_onehot'][:, transition_idx - 1]
			actions_onehot_last = actions_onehot_last.view((episode_num, 1, -1)).repeat(1, self.n_agents, 1)

		inputs, inputs_next = [], []

		# state and observation
		inputs.append(state)
		inputs_next.append(state_next)
		inputs.append(obs)
		inputs_next.append(obs_next)

		# previous actions of all agents (for the next step that is the current action)
		inputs.append(actions_onehot_last)
		inputs_next.append(actions_onehot)

		# Current actions: the critic only receives the actions of the *other*
		# agents, so each agent's own slot is zeroed by the mask.
		action_mask = (1 - torch.eye(self.n_agents))
		action_mask = action_mask.view(-1, 1).repeat(1, self.n_actions).view(self.n_agents, -1)
		inputs.append(actions_onehot * action_mask.unsqueeze(0))
		inputs_next.append(actions_onehot_next * action_mask.unsqueeze(0))

		# one-hot agent id
		inputs.append(torch.eye(self.n_agents).unsqueeze(0).expand(episode_num, -1, -1))
		inputs_next.append(torch.eye(self.n_agents).unsqueeze(0).expand(episode_num, -1, -1))

		# (episode_num, n_agents, features) -> (episode_num * n_agents, features)
		inputs = torch.cat([x.reshape(episode_num * self.n_agents, -1) for x in inputs], dim=1)
		inputs_next = torch.cat([x.reshape(episode_num * self.n_agents, -1) for x in inputs_next], dim=1)

		return inputs, inputs_next

	def _get_q_values(self, batch, max_episode_len):
		"""Evaluate both critics on every transition.

		Returns:
			Tuple ``(q_evals, q_targets)``, each stacked to
			(episode_num, max_episode_len, n_agents, n_actions).
		"""
		episode_num = batch['obs'].shape[0]
		q_evals, q_targets = [], []
		for transition_idx in range(max_episode_len):
			inputs, inputs_next = self._get_critic_inputs(batch, transition_idx, max_episode_len)

			# (episode_num * n_agents, features) in, (episode_num * n_agents, n_actions) out
			q_eval = self.eval_critic(inputs)
			q_target = self.target_critic(inputs_next)

			# back to (episode_num, n_agents, n_actions)
			q_evals.append(q_eval.view(episode_num, self.n_agents, -1))
			q_targets.append(q_target.view(episode_num, self.n_agents, -1))

		# one entry per transition -> stack along a new time dimension
		q_evals = torch.stack(q_evals, dim=1)
		q_targets = torch.stack(q_targets, dim=1)

		return q_evals, q_targets

	def _get_actor_inputs(self, batch, transition_idx):
		"""Build the actor input for one transition.

		Returns:
			2-D tensor with ``episode_num * n_agents`` rows holding the
			observation, then (if ``args.last_action``) the previous one-hot
			action, then (if ``args.reuse_network``) the one-hot agent id.
		"""
		obs, actions_onehot = batch['obs'][:, transition_idx], batch['actions_onehot'][:]
		episode_num = obs.shape[0]
		inputs = [obs]

		if self.args.last_action:
			if transition_idx == 0:
				# no previous action at the first transition
				inputs.append(torch.zeros_like(actions_onehot[:, transition_idx]))
			else:
				inputs.append(actions_onehot[:, transition_idx - 1])
		if self.args.reuse_network:
			# one network serves all agents, so tag each row with its agent id
			inputs.append(torch.eye(self.args.n_agents).unsqueeze(0).expand(episode_num, -1, -1))

		inputs = torch.cat([x.reshape(episode_num * self.args.n_agents, -1) for x in inputs], dim=1)

		return inputs

	def _get_action_prob(self, batch, max_episode_len, epsilon):
		"""Return epsilon-smoothed action probabilities for the whole batch.

		The actor is run over every transition, its softmax output is mixed
		with a uniform distribution over the available actions, unavailable
		actions are zeroed and the result is renormalised.

		Returns:
			Tensor of shape (episode_num, max_episode_len, n_agents, n_actions).
		"""
		episode_num = batch['obs'].shape[0]
		avail_actions = batch['avail_actions']  # COMA has no target actor, so next-step availability is not needed
		action_prob = []
		for transition_idx in range(max_episode_len):
			inputs = self._get_actor_inputs(batch, transition_idx)

			outputs, self.eval_hidden = self.eval_rnn(inputs, self.eval_hidden)
			outputs = outputs.view(episode_num, self.n_agents, -1)
			action_prob.append(torch.nn.functional.softmax(outputs, dim=-1))

		action_prob = torch.stack(action_prob, dim=1).cpu()

		# number of selectable actions, repeated along the action dimension
		action_num = avail_actions.sum(dim=-1, keepdim=True).float().repeat(1, 1, 1, avail_actions.shape[-1])
		action_prob = ((1 - epsilon) * action_prob + torch.ones_like(action_prob) * epsilon / action_num)
		action_prob[avail_actions == 0] = 0.0

		# renormalise after removing the unavailable actions
		action_prob = action_prob / action_prob.sum(dim=-1, keepdim=True)

		# zero again to clear values produced where the row sum was 0
		action_prob[avail_actions == 0] = 0.0

		return action_prob

	def init_hidden(self, episode_num):
		"""Reset the actor's recurrent state to zeros for ``episode_num`` episodes."""
		self.eval_hidden = torch.zeros((episode_num, self.n_agents, self.args.rnn_hidden_dim))

	def _train_critic(self, batch, max_episode_len, train_step):
		"""Perform one TD(lambda) update of the critic.

		Returns:
			The eval critic's Q-values for all actions, cloned before the
			update, for use in the actor's advantage.
		"""
		actions = batch['actions']
		# Next-step actions: shift by one and pad the final step with zeros.
		actions_next = actions[:, 1:]
		padded_actions_next = torch.zeros(*actions[:, -1].shape, dtype=torch.long).unsqueeze(1)
		actions_next = torch.cat((actions_next, padded_actions_next), dim=1)
		mask = (1 - batch['padded'].float()).repeat(1, 1, self.n_agents)  # zero the TD error of padded transitions

		q_evals, q_next_target = self._get_q_values(batch, max_episode_len)
		q_values = q_evals.clone()  # returned to compute the advantage

		# pick the Q-value of the action each agent took and drop the singleton dim
		q_evals = torch.gather(q_evals, dim=3, index=actions).squeeze(3)
		q_next_target = torch.gather(q_next_target, dim=3, index=actions_next).squeeze(3)
		targets = td_lambda_target(batch, max_episode_len, q_next_target.cpu(), self.args)

		td_error = targets.detach() - q_evals
		masked_td_error = mask * td_error

		loss = (masked_td_error ** 2).sum() / mask.sum()

		self.critic_optimizer.zero_grad()
		loss.backward()
		torch.nn.utils.clip_grad_norm_(self.critic_parameters, self.args.grad_norm_clip)
		self.critic_optimizer.step()

		# periodically refresh the target critic
		if train_step > 0 and train_step % self.args.target_update_cycle == 0:
			self.target_critic.load_state_dict(self.eval_critic.state_dict())

		return q_values

	def save_model(self, train_step):
		"""Save critic and actor parameters under ``model_dir``.

		File names are prefixed with ``train_step // args.save_cycle``.
		NOTE(review): __init__ loads un-prefixed file names - confirm how
		checkpoints are meant to be selected for loading.
		"""
		num = str(train_step // self.args.save_cycle)
		os.makedirs(self.model_dir, exist_ok=True)
		torch.save(self.eval_critic.state_dict(), self.model_dir + '/' + num + '_critic_params.pkl')
		torch.save(self.eval_rnn.state_dict(), self.model_dir + '/' + num + '_rnn_params.pkl')
Пример #4
 def create_actor_network(self, name):
     """Create the actor's input placeholder and build its graph.

     Args:
         name: passed through to ``CommNet.actor_build_network``.

     Returns:
         Tuple ``(inputs, out)``: the float32 placeholder and whatever
         ``CommNet.actor_build_network`` returns for it.
     """
     # NOTE(review): the placeholder call was left unclosed; if it used to
     # carry a name= argument it is not recoverable from here - confirm.
     inputs = tf.placeholder(tf.float32,
                             shape=(None, NUM_AGENTS, VECTOR_OBS_LEN))
     out = CommNet.actor_build_network(name, inputs)
     return inputs, out
Пример #5
	def __init__(self, args):
		"""Build the agent networks, the mixing networks and the optimizers.

		Attributes read from ``args``: n_actions, n_agents, state_shape,
		obs_shape, last_action, reuse_network, alg, model_dir, load_model,
		optimizer and lr.

		Raises:
			Exception: if ``args.alg`` is neither 'qmix' nor 'qmix+commnet', or
				if ``args.load_model`` is set and no checkpoint exists.
		"""
		self.n_actions = args.n_actions
		self.n_agents = args.n_agents
		self.state_shape = args.state_shape
		self.obs_shape = args.obs_shape

		# The agent net sees the observation, optionally extended with the
		# previous action (one-hot) and the agent id (one-hot).
		input_shape = self.obs_shape
		if args.last_action:
			input_shape += self.n_actions
		if args.reuse_network:
			input_shape += self.n_agents

		if args.alg == 'qmix':
			self.eval_rnn = RNN(input_shape, args)  # per-agent action-value net
			self.target_rnn = RNN(input_shape, args)
			print("QMIX initialized")
		elif args.alg == 'qmix+commnet':
			# communication network, used for the action weights
			self.eval_comm = CommNet(input_shape, args)
			self.target_comm = CommNet(input_shape, args)
			self.eval_rnn = RNN(input_shape, args)  # per-agent action-value net
			self.target_rnn = RNN(input_shape, args)
			self.criterion = nn.CrossEntropyLoss()
			print("QMIX+COMMNET initialized")
		else:
			# Only unknown algorithm names are rejected.
			raise Exception("No such algorithm")

		self.eval_qmix_net = QMixNet(args)  # network that mixes the agents' Q-values
		self.target_qmix_net = QMixNet(args)  # target network, as in DQN
		self.args = args

		self.model_dir = args.model_dir + '/' + args.alg

		if self.args.load_model:
			path_rnn = self.model_dir + '/rnn_net_params.pkl'
			path_qmix = self.model_dir + '/qmix_net_params.pkl'
			if not os.path.exists(path_rnn):
				raise Exception("No such model!")
			# map_location='cpu' lets checkpoints saved on another device load;
			# load_state_dict copies the values into the existing parameters.
			self.eval_rnn.load_state_dict(torch.load(path_rnn, map_location='cpu'))
			self.eval_qmix_net.load_state_dict(torch.load(path_qmix, map_location='cpu'))
			print('Successfully loaded the model: {} and {}'.format(path_rnn, path_qmix))

		# Start every target network from the parameters of its eval twin.
		self.target_rnn.load_state_dict(self.eval_rnn.state_dict())
		self.target_qmix_net.load_state_dict(self.eval_qmix_net.state_dict())
		if args.alg == 'qmix+commnet':
			self.target_comm.load_state_dict(self.eval_comm.state_dict())

		self.eval_parameters = list(self.eval_qmix_net.parameters()) + list(self.eval_rnn.parameters())
		if args.alg == 'qmix+commnet':
			self.eval_comm_parameters = list(self.eval_comm.parameters())

		# NOTE(review): no optimizer is created for any other value of
		# args.optimizer, so training would fail later - confirm that is intended.
		if args.optimizer == "RMS":
			self.optimizer = torch.optim.RMSprop(self.eval_parameters, lr=args.lr)
			if args.alg == 'qmix+commnet':
				self.comm_optimizer = torch.optim.RMSprop(self.eval_comm_parameters, lr=args.lr)

		# During learning an eval and a target hidden state are kept for each
		# agent of each episode; they are allocated per batch.
		self.eval_hidden = None
		self.target_hidden = None
		self.eval_comm_hidden = None
		self.target_comm_hidden = None
Пример #6
class QMix:
	"""QMIX learner with an optional CommNet branch ('qmix+commnet').

	Per-agent Q-values from ``eval_rnn`` are combined by ``eval_qmix_net``;
	``target_rnn`` / ``target_qmix_net`` provide the TD targets. With
	'qmix+commnet' an additional eval/target CommNet pair is trained by
	``train_weights``.
	"""

	def __init__(self, args):
		"""Build the agent networks, the mixing networks and the optimizers.

		Attributes read from ``args``: n_actions, n_agents, state_shape,
		obs_shape, last_action, reuse_network, alg, model_dir, load_model,
		optimizer and lr.

		Raises:
			Exception: if ``args.alg`` is neither 'qmix' nor 'qmix+commnet', or
				if ``args.load_model`` is set and no checkpoint exists.
		"""
		self.n_actions = args.n_actions
		self.n_agents = args.n_agents
		self.state_shape = args.state_shape
		self.obs_shape = args.obs_shape

		# The agent net sees the observation, optionally extended with the
		# previous action (one-hot) and the agent id (one-hot).
		input_shape = self.obs_shape
		if args.last_action:
			input_shape += self.n_actions
		if args.reuse_network:
			input_shape += self.n_agents

		if args.alg == 'qmix':
			self.eval_rnn = RNN(input_shape, args)  # per-agent action-value net
			self.target_rnn = RNN(input_shape, args)
			print("QMIX initialized")
		elif args.alg == 'qmix+commnet':
			# communication network, used for the action weights
			self.eval_comm = CommNet(input_shape, args)
			self.target_comm = CommNet(input_shape, args)
			self.eval_rnn = RNN(input_shape, args)  # per-agent action-value net
			self.target_rnn = RNN(input_shape, args)
			self.criterion = nn.CrossEntropyLoss()
			print("QMIX+COMMNET initialized")
		else:
			# Only unknown algorithm names are rejected.
			raise Exception("No such algorithm")

		self.eval_qmix_net = QMixNet(args)  # network that mixes the agents' Q-values
		self.target_qmix_net = QMixNet(args)  # target network, as in DQN
		self.args = args

		self.model_dir = args.model_dir + '/' + args.alg

		if self.args.load_model:
			path_rnn = self.model_dir + '/rnn_net_params.pkl'
			path_qmix = self.model_dir + '/qmix_net_params.pkl'
			if not os.path.exists(path_rnn):
				raise Exception("No such model!")
			# map_location='cpu' lets checkpoints saved on another device load;
			# load_state_dict copies the values into the existing parameters.
			self.eval_rnn.load_state_dict(torch.load(path_rnn, map_location='cpu'))
			self.eval_qmix_net.load_state_dict(torch.load(path_qmix, map_location='cpu'))
			print('Successfully loaded the model: {} and {}'.format(path_rnn, path_qmix))

		# Start every target network from the parameters of its eval twin.
		self.target_rnn.load_state_dict(self.eval_rnn.state_dict())
		self.target_qmix_net.load_state_dict(self.eval_qmix_net.state_dict())
		if args.alg == 'qmix+commnet':
			self.target_comm.load_state_dict(self.eval_comm.state_dict())

		self.eval_parameters = list(self.eval_qmix_net.parameters()) + list(self.eval_rnn.parameters())
		if args.alg == 'qmix+commnet':
			self.eval_comm_parameters = list(self.eval_comm.parameters())

		# NOTE(review): no optimizer is created for any other value of
		# args.optimizer, so learn() would fail later - confirm that is intended.
		if args.optimizer == "RMS":
			self.optimizer = torch.optim.RMSprop(self.eval_parameters, lr=args.lr)
			if args.alg == 'qmix+commnet':
				self.comm_optimizer = torch.optim.RMSprop(self.eval_comm_parameters, lr=args.lr)

		# During learning an eval and a target hidden state are kept for each
		# agent of each episode; they are allocated by init_hidden().
		self.eval_hidden = None
		self.target_hidden = None
		self.eval_comm_hidden = None
		self.target_comm_hidden = None

	def learn(self, batch, max_episode_len, train_step, epsilon=None):
		"""Run one TD update of the agent and mixing networks.

		Args:
			batch: dict of episode data, converted in place to tensors
				('actions' becomes long, everything else float32). Extracted
				data is 4-D: episodes, transitions in the episode, agents,
				feature dimension.
			max_episode_len: length of the longest episode in the batch.
			train_step: update counter; controls when the target networks are
				refreshed.
			epsilon: unused here; kept for a uniform learner interface.

		The recurrent hidden state depends on the preceding transitions, so
		single transitions cannot be sampled at random; whole episodes are
		drawn and fed to the networks one transition at a time.
		"""
		episode_num = batch['obs'].shape[0]  # number of episodes in the batch
		self.init_hidden(episode_num)  # fresh recurrent state for this batch

		# convert data in batch to tensors
		for key in batch.keys():
			if key == 'actions':
				batch[key] = torch.tensor(batch[key], dtype=torch.long)
			else:
				batch[key] = torch.tensor(batch[key], dtype=torch.float32)

		state, state_next = batch['state'], batch['state_next']
		actions, reward = batch['actions'], batch['reward']
		avail_actions_next, terminated = batch['avail_actions_next'], batch['terminated']

		# Zeroes the TD error of padded transitions so they do not affect learning.
		mask = 1 - batch["padded"].float()

		# per-agent Q-values: (episode_num, max_episode_len, n_agents, n_actions)
		q_evals, q_targets = self.get_q_values(batch, max_episode_len)

		# Q-value of the action each agent took; drop the singleton dim
		q_evals = torch.gather(q_evals, dim=3, index=actions).squeeze(3)

		# Unavailable next actions must never win the max.
		q_targets[avail_actions_next == 0.0] = - 9999999
		q_targets = q_targets.max(dim=3)[0]

		# mix the per-agent values into joint values
		q_total_eval = self.eval_qmix_net(q_evals, state)
		q_total_target = self.target_qmix_net(q_targets, state_next)

		# the communication network is trained here, with its own optimizer
		if self.args.alg == 'qmix+commnet':
			self.train_weights(batch, max_episode_len, q_total_eval, q_total_target)

		targets = reward + self.args.gamma * q_total_target * (1 - terminated)

		td_error = (q_total_eval - targets.detach())
		masked_td_error = mask * td_error

		# Average over real transitions only (both algorithms use the same loss).
		loss = (masked_td_error ** 2).sum() / mask.sum()

		# The targets are detached and train_weights() builds its own graph, so
		# this graph is only traversed once.
		self.optimizer.zero_grad()
		loss.backward()
		torch.nn.utils.clip_grad_norm_(self.eval_parameters, self.args.grad_norm_clip)
		self.optimizer.step()

		# periodically refresh the target networks
		if train_step > 0 and train_step % self.args.target_update_cycle == 0:
			self.target_rnn.load_state_dict(self.eval_rnn.state_dict())
			self.target_qmix_net.load_state_dict(self.eval_qmix_net.state_dict())
			if self.args.alg == 'qmix+commnet':
				self.target_comm.load_state_dict(self.eval_comm.state_dict())

	def train_weights(self, batch, max_episode_len, q_total_eval, q_total_target):
		"""Perform one TD update of the communication network.

		Args:
			batch: episode data already converted to tensors by ``learn``.
			max_episode_len: number of transitions processed per episode.
			q_total_eval, q_total_target: accepted so the mixed values are
				available to this loss; currently unused.

		Returns:
			The eval CommNet outputs gathered at the taken actions.
		"""
		actions, reward, terminated = batch['actions'], batch['reward'], batch['terminated']

		mask = 1 - batch["padded"].float()
		weights, weights_target = self.get_weights(batch, max_episode_len)

		weights = torch.gather(weights, dim=3, index=actions).squeeze(3)

		weights_target = weights_target.max(dim=3)[0]
		targets = reward + self.args.gamma * weights_target * (1 - terminated)

		td_error = (weights - targets.detach())
		masked_td_error = mask * td_error

		# average over real transitions only
		loss = (masked_td_error ** 2).sum() / mask.sum()

		self.comm_optimizer.zero_grad()
		loss.backward()
		torch.nn.utils.clip_grad_norm_(self.eval_comm_parameters, self.args.grad_norm_clip)
		self.comm_optimizer.step()

		return weights

	def _get_action_prob(self, batch, max_episode_len, epsilon):
		"""Return epsilon-smoothed action probabilities from the CommNet.

		The softmax output is mixed with a uniform distribution over the
		available actions, unavailable actions are zeroed and the result is
		renormalised.

		Returns:
			Tensor of shape (episode_num, max_episode_len, n_agents, n_actions).
		"""
		episode_num = batch['obs'].shape[0]
		avail_actions = batch['avail_actions']
		action_prob = []
		for transition_idx in range(max_episode_len):
			inputs = self._get_actor_inputs(batch, transition_idx)

			outputs, self.eval_comm_hidden = self.eval_comm(inputs, self.eval_comm_hidden)
			outputs = outputs.view(episode_num, self.n_agents, -1)
			action_prob.append(torch.nn.functional.softmax(outputs, dim=-1))

		action_prob = torch.stack(action_prob, dim=1).cpu()
		# number of selectable actions, repeated along the action dimension
		action_num = avail_actions.sum(dim=-1, keepdim=True).float().repeat(1, 1, 1, avail_actions.shape[-1])
		action_prob = ((1 - epsilon) * action_prob + torch.ones_like(action_prob) * epsilon / action_num)
		action_prob[avail_actions == 0] = 0.0

		# renormalise after removing the unavailable actions
		action_prob = action_prob / action_prob.sum(dim=-1, keepdim=True)

		# zero again to clear values produced where the row sum was 0
		action_prob[avail_actions == 0] = 0.0

		return action_prob

	def _get_actor_inputs(self, batch, transition_idx):
		"""Build the CommNet input for one transition.

		Returns:
			2-D tensor with ``episode_num * n_agents`` rows holding the
			observation, then (if ``args.last_action``) the previous one-hot
			action, then (if ``args.reuse_network``) the one-hot agent id.
		"""
		obs, actions_onehot = batch['obs'][:, transition_idx], batch['actions_onehot'][:]
		episode_num = obs.shape[0]
		inputs = [obs]

		if self.args.last_action:
			if transition_idx == 0:
				# no previous action at the first transition
				inputs.append(torch.zeros_like(actions_onehot[:, transition_idx]))
			else:
				inputs.append(actions_onehot[:, transition_idx - 1])
		if self.args.reuse_network:
			# one network serves all agents, so tag each row with its agent id
			inputs.append(torch.eye(self.args.n_agents).unsqueeze(0).expand(episode_num, -1, -1))

		inputs = torch.cat([x.reshape(episode_num * self.args.n_agents, -1) for x in inputs], dim=1)

		return inputs

	def get_q_values(self, batch, max_episode_len):
		"""Run the eval and target agent nets over every transition.

		Returns:
			Tuple ``(q_evals, q_targets)``, each stacked to
			(episode_num, max_episode_len, n_agents, n_actions).
		"""
		episode_num = batch['obs'].shape[0]  # number of episodes in the batch
		q_evals, q_targets = [], []
		for transition_idx in range(max_episode_len):
			inputs, inputs_next = self._get_inputs(batch, transition_idx)  # obs plus last action and agent id
			q_eval, self.eval_hidden = self.eval_rnn(inputs, self.eval_hidden)
			q_target, self.target_hidden = self.target_rnn(inputs_next, self.target_hidden)

			# (episode_num * n_agents, n_actions) -> (episode_num, n_agents, n_actions)
			q_evals.append(q_eval.view(episode_num, self.n_agents, -1))
			q_targets.append(q_target.view(episode_num, self.n_agents, -1))

		# one entry per transition -> stack along a new time dimension
		q_evals = torch.stack(q_evals, dim=1)
		q_targets = torch.stack(q_targets, dim=1)
		return q_evals, q_targets

	def get_weights(self, batch, max_episode_len):
		"""Run the eval and target CommNets over every transition.

		Returns:
			Tuple ``(evals, targets)``, each stacked to
			(episode_num, max_episode_len, n_agents, n_actions).
		"""
		episode_num = batch['obs'].shape[0]  # number of episodes in the batch
		q_evals, q_targets = [], []
		for transition_idx in range(max_episode_len):
			inputs, inputs_next = self._get_inputs(batch, transition_idx)  # obs plus last action and agent id
			q_eval, self.eval_comm_hidden = self.eval_comm(inputs, self.eval_comm_hidden)
			q_target, self.target_comm_hidden = self.target_comm(inputs_next, self.target_comm_hidden)

			# (episode_num * n_agents, n_actions) -> (episode_num, n_agents, n_actions)
			q_evals.append(q_eval.view(episode_num, self.n_agents, -1))
			q_targets.append(q_target.view(episode_num, self.n_agents, -1))

		# one entry per transition -> stack along a new time dimension
		q_evals = torch.stack(q_evals, dim=1)
		q_targets = torch.stack(q_targets, dim=1)
		return q_evals, q_targets

	def _get_inputs(self, batch, transition_idx):
		"""Build the network inputs for one transition and for its successor.

		Returns:
			Tuple ``(inputs, inputs_next)`` of 2-D tensors with
			``episode_num * n_agents`` rows: observation, then (if
			``args.last_action``) the preceding one-hot action, then (if
			``args.reuse_network``) the one-hot agent id.
		"""
		# All of actions_onehot is kept because the previous step's action is needed.
		obs, obs_next = batch['obs'][:, transition_idx], batch['obs_next'][:, transition_idx]
		actions_onehot = batch['actions_onehot'][:]
		episode_num = obs.shape[0]
		inputs, inputs_next = [obs], [obs_next]

		# add the last action and the agent id to the observation
		if self.args.last_action:
			if transition_idx == 0:
				# first transition: the previous action is a zero vector
				inputs.append(torch.zeros_like(actions_onehot[:, transition_idx]))
			else:
				inputs.append(actions_onehot[:, transition_idx - 1])
			inputs_next.append(actions_onehot[:, transition_idx])

		if self.args.reuse_network:
			# One network is shared by all agents. The data is
			# (episode, agent, feature), so the id of agent i is row i of an
			# identity matrix, e.g. (1, 0, 0, 0, 0) for agent 0 of 5.
			inputs.append(torch.eye(self.args.n_agents).unsqueeze(0).expand(episode_num, -1, -1))
			inputs_next.append(torch.eye(self.args.n_agents).unsqueeze(0).expand(episode_num, -1, -1))

		# Concatenate the parts and fold episodes and agents into one row
		# dimension; each row still carries its agent id, so the shared network
		# can tell the agents apart.
		inputs = torch.cat([x.reshape(episode_num * self.args.n_agents, -1) for x in inputs], dim=1)
		inputs_next = torch.cat([x.reshape(episode_num * self.args.n_agents, -1) for x in inputs_next], dim=1)

		# TODO: check whether inputs_next is equivalent to inputs shifted by one step
		return inputs, inputs_next

	def init_hidden(self, episode_num):
		"""Reset the recurrent states to zeros for ``episode_num`` episodes.

		An eval and a target state are kept, as for the networks themselves;
		the CommNet states are only allocated for 'qmix+commnet'.
		"""
		self.eval_hidden = torch.zeros((episode_num, self.n_agents, self.args.rnn_hidden_dim))
		self.target_hidden = torch.zeros((episode_num, self.n_agents, self.args.rnn_hidden_dim))

		if self.args.alg == 'qmix+commnet':
			self.eval_comm_hidden = torch.zeros((episode_num, self.n_agents, self.args.rnn_hidden_dim))
			self.target_comm_hidden = torch.zeros((episode_num, self.n_agents, self.args.rnn_hidden_dim))

	def save_model(self, train_step):
		"""Save the mixing-net and agent-net parameters under ``model_dir``.

		File names are prefixed with ``train_step // args.save_cycle``.
		NOTE(review): __init__ loads un-prefixed file names, and the CommNet is
		not saved - confirm both are intended.
		"""
		num = str(train_step // self.args.save_cycle)
		os.makedirs(self.model_dir, exist_ok=True)
		torch.save(self.eval_qmix_net.state_dict(), self.model_dir + '/' + num + '_qmix_net_params.pkl')
		torch.save(self.eval_rnn.state_dict(), self.model_dir + '/' + num + '_rnn_net_params.pkl')
Пример #7
    def __init__(self, args):
        """Build the agent networks, the summing networks and the optimizer.

        Attributes read from ``args``: n_actions, n_agents, state_shape,
        obs_shape, last_action, reuse_network, alg, cuda, model_dir,
        load_model, optimizer and lr.

        Raises:
            Exception: if ``args.alg`` is neither 'vdn' nor 'vdn+commnet', or
                if ``args.load_model`` is set and no checkpoint exists.
        """
        self.n_actions = args.n_actions
        self.n_agents = args.n_agents
        self.state_shape = args.state_shape
        self.obs_shape = args.obs_shape

        # The agent net sees the observation, optionally extended with the
        # previous action (one-hot) and the agent id (one-hot).
        input_shape = self.obs_shape
        if args.last_action:
            input_shape += self.n_actions
        if args.reuse_network:
            input_shape += self.n_agents

        if args.alg == 'vdn':
            self.eval_rnn = RNN(input_shape, args)  # per-agent action-value net
            self.target_rnn = RNN(input_shape, args)
            print('VDN alg initialized')
        elif args.alg == 'vdn+commnet':
            self.eval_rnn = CommNet(input_shape, args)
            self.target_rnn = CommNet(input_shape, args)
            print('VDN+COMMNET initialized')
        else:
            # Fail here rather than on the first use of the missing networks.
            raise Exception("No such algorithm!")

        self.eval_vdn_net = VDNNet()  # network that adds up the agents' Q-values
        self.target_vdn_net = VDNNet()  # target network, as in DQN
        self.args = args

        # move every network to the GPU when requested
        if self.args.cuda:
            self.eval_rnn.cuda()
            self.target_rnn.cuda()
            self.eval_vdn_net.cuda()
            self.target_vdn_net.cuda()

        self.model_dir = args.model_dir + '/' + args.alg

        if self.args.load_model:
            path_rnn = self.model_dir + '/rnn_net_params.pkl'
            path_vdn = self.model_dir + '/vdn_net_params.pkl'
            if not os.path.exists(path_rnn):
                raise Exception("No such model!")
            # load onto the device the networks live on
            map_location = 'cuda' if self.args.cuda else 'cpu'
            self.eval_rnn.load_state_dict(
                torch.load(path_rnn, map_location=map_location))
            self.eval_vdn_net.load_state_dict(
                torch.load(path_vdn, map_location=map_location))
            print('Successfully load the model: {} and {}'.format(
                path_rnn, path_vdn))

        # Start every target network from the parameters of its eval twin.
        self.target_rnn.load_state_dict(self.eval_rnn.state_dict())
        self.target_vdn_net.load_state_dict(self.eval_vdn_net.state_dict())

        self.eval_parameters = list(self.eval_vdn_net.parameters()) + list(
            self.eval_rnn.parameters())
        # NOTE(review): no optimizer is created for any other value of
        # args.optimizer - confirm that is intended.
        if args.optimizer == "RMS":
            self.optimizer = torch.optim.RMSprop(self.eval_parameters,
                                                 lr=args.lr)

        # During learning an eval and a target hidden state are kept for each
        # agent of each episode; they are allocated per batch.
        self.eval_hidden = None
        self.target_hidden = None
Пример #8
class VDN:
    """Value-Decomposition Network learner.

    Each agent's Q-values come from a recurrent network (``eval_rnn``); the
    Q-values of the chosen actions are combined by ``VDNNet`` into a joint
    value that is trained with a one-step TD target computed from separate
    target networks.
    """

    def __init__(self, args):
        """Build the VDN learner: agent Q-networks, mixers and optimizer.

        Args:
            args: Configuration namespace. Attributes read here are
                ``n_actions``, ``n_agents``, ``state_shape``, ``obs_shape``,
                ``last_action``, ``reuse_network``, ``alg``, ``cuda``,
                ``model_dir``, ``load_model``, ``optimizer`` and (for the
                RMSprop optimizer) ``lr``.

        Raises:
            Exception: If ``args.alg`` is neither ``'vdn'`` nor
                ``'vdn+commnet'``, or if ``args.load_model`` is set but no
                checkpoint is found in the model directory.
        """
        self.n_actions = args.n_actions
        self.n_agents = args.n_agents
        self.state_shape = args.state_shape
        self.obs_shape = args.obs_shape
        self.args = args

        # The network input is the observation, optionally extended with the
        # previous action (one-hot) and the agent id (one-hot).
        input_shape = self.obs_shape
        if args.last_action:
            input_shape += self.n_actions
        if args.reuse_network:
            input_shape += self.n_agents

        # Per-agent Q-network (evaluation copy and target copy, as in DQN).
        if args.alg == 'vdn':
            self.eval_rnn = RNN(input_shape, args)
            self.target_rnn = RNN(input_shape, args)
            print('VDN alg initialized')
        elif args.alg == 'vdn+commnet':
            self.eval_rnn = CommNet(input_shape, args)
            self.target_rnn = CommNet(input_shape, args)
            print('VDN+COMMNET initialized')
        else:
            # Fail here instead of with an AttributeError on eval_rnn below.
            raise Exception("No such algorithm!")

        self.eval_vdn_net = VDNNet()  # network that adds up the agents' Q-values
        self.target_vdn_net = VDNNet()  # target network, as in DQN

        if self.args.cuda:
            self.eval_rnn.cuda()
            self.target_rnn.cuda()
            self.eval_vdn_net.cuda()
            self.target_vdn_net.cuda()

        self.model_dir = args.model_dir + '/' + args.alg

        if self.args.load_model:
            path_rnn = self.model_dir + '/rnn_net_params.pkl'
            path_vdn = self.model_dir + '/vdn_net_params.pkl'
            if os.path.exists(path_rnn):
                map_location = 'cuda:0' if self.args.cuda else 'cpu'
                # NOTE: torch.load unpickles the file; only load checkpoints
                # that come from a trusted source.
                self.eval_rnn.load_state_dict(
                    torch.load(path_rnn, map_location=map_location))
                self.eval_vdn_net.load_state_dict(
                    torch.load(path_vdn, map_location=map_location))
                print('Successfully load the model: {} and {}'.format(
                    path_rnn, path_vdn))
            else:
                raise Exception("No such model!")

        # Make the target networks start from the same parameters as the
        # evaluation networks.
        self.target_rnn.load_state_dict(self.eval_rnn.state_dict())
        self.target_vdn_net.load_state_dict(self.eval_vdn_net.state_dict())

        self.eval_parameters = list(self.eval_vdn_net.parameters()) + list(
            self.eval_rnn.parameters())
        # No optimizer is created for other values of args.optimizer.
        if args.optimizer == "RMS":
            # NOTE(review): learning-rate attribute assumed to be args.lr --
            # confirm against the argument parser.
            self.optimizer = torch.optim.RMSprop(self.eval_parameters,
                                                 lr=args.lr)

        # During learning an eval_hidden and a target_hidden are kept for
        # every agent of every episode; see init_hidden.
        self.eval_hidden = None
        self.target_hidden = None

    def learn(self, batch, max_episode_len, train_step, epsilon=None):
        """Run one gradient step on a batch of episodes.

        The extracted data is 4-D: (n_episodes, n_transitions, n_agents,
        feature dimension). Because the recurrent hidden state depends on the
        previous steps, single transitions cannot be sampled at random; whole
        episodes are taken and fed to the network one step at a time.

        Args:
            batch: Dict of episode data gathered before training. Keys read
                are 'obs', 'obs_next', 'actions', 'actions_onehot', 'reward',
                'avail_actions_next', 'terminated' and 'padded'. Every value
                in the dict is replaced in place by a tensor.
            max_episode_len: Length of the longest episode in ``batch``.
            train_step: Index of this update; controls when the target
                networks are refreshed.
            epsilon: Not used by this method.
        """
        episode_num = batch['obs'].shape[0]  # number of episodes in the batch
        self.init_hidden(episode_num)

        # Convert the batch to tensors; actions are indices, the rest floats.
        for key in batch:
            if key == 'actions':
                batch[key] = torch.tensor(batch[key], dtype=torch.long)
            else:
                batch[key] = torch.tensor(batch[key], dtype=torch.float32)

        actions = batch['actions']
        reward = batch['reward']
        avail_actions_next = batch['avail_actions_next']
        terminated = batch['terminated']

        # Zeroes the TD error of padded steps so they do not affect learning.
        mask = 1 - batch["padded"].float()

        if self.args.cuda:
            actions = actions.cuda()
            reward = reward.cuda()
            mask = mask.cuda()
            terminated = terminated.cuda()
            avail_actions_next = avail_actions_next.cuda()

        # Q-values of every agent:
        # (episode_num, max_episode_len, n_agents, n_actions).
        q_evals, q_targets = self.get_q_values(batch, max_episode_len)

        # Keep only the Q-value of the action each agent took and drop the
        # now size-1 last dimension.
        q_evals = torch.gather(q_evals, dim=3, index=actions).squeeze(3)

        # Unavailable actions get a very low value so max() never picks them.
        q_targets[avail_actions_next == 0.0] = -9999999
        q_targets = q_targets.max(dim=3)[0]

        # Mix the per-agent values into a joint value with VDN.
        q_total_eval = self.eval_vdn_net(q_evals)
        q_total_target = self.target_vdn_net(q_targets)

        targets = reward + self.args.gamma * q_total_target * (1 - terminated)

        td_error = targets.detach() - q_total_eval
        masked_td_error = mask * td_error

        # Padded steps are excluded, so average over the real steps only.
        loss = (masked_td_error**2).sum() / mask.sum()

        self.optimizer.zero_grad()
        loss.backward()
        # NOTE(review): the clipping threshold is assumed to be
        # args.grad_norm_clip; clipping is skipped when it is not configured.
        grad_norm_clip = getattr(self.args, 'grad_norm_clip', None)
        if grad_norm_clip is not None:
            torch.nn.utils.clip_grad_norm_(self.eval_parameters,
                                           grad_norm_clip)
        self.optimizer.step()

        # Periodically copy the evaluation networks into the target networks.
        if train_step > 0 and train_step % self.args.target_update_cycle == 0:
            self.target_rnn.load_state_dict(self.eval_rnn.state_dict())
            self.target_vdn_net.load_state_dict(
                self.eval_vdn_net.state_dict())

    def get_q_values(self, batch, max_episode_len):
        """Compute eval and target Q-values for every step of every episode.

        Args:
            batch: Dict of tensors; see ``_get_inputs`` for the keys used.
            max_episode_len: Number of transitions to unroll.

        Returns:
            Tuple ``(q_evals, q_targets)``, each stacked along dim 1 into
            shape (episode_num, max_episode_len, n_agents, n_actions).
        """
        episode_num = batch['obs'].shape[0]  # number of episodes in the batch
        q_evals, q_targets = [], []
        for transition_idx in range(max_episode_len):
            # Observation plus (optionally) last action and agent id.
            inputs, inputs_next = self._get_inputs(batch, transition_idx)

            if self.args.cuda:
                inputs = inputs.cuda()
                inputs_next = inputs_next.cuda()
                self.eval_hidden = self.eval_hidden.cuda()
                self.target_hidden = self.target_hidden.cuda()

            # Inputs have one row per (episode, agent) pair; so do the
            # returned Q-values.
            q_eval, self.eval_hidden = self.eval_rnn(inputs, self.eval_hidden)
            q_target, self.target_hidden = self.target_rnn(
                inputs_next, self.target_hidden)

            # Back to (episode_num, n_agents, n_actions).
            q_eval = q_eval.view(episode_num, self.n_agents, -1)
            q_target = q_target.view(episode_num, self.n_agents, -1)
            q_evals.append(q_eval)
            q_targets.append(q_target)

        # The lists hold max_episode_len tensors of shape
        # (episode_num, n_agents, n_actions); stack them into
        # (episode_num, max_episode_len, n_agents, n_actions).
        q_evals = torch.stack(q_evals, dim=1)
        q_targets = torch.stack(q_targets, dim=1)
        return q_evals, q_targets

    def _get_inputs(self, batch, transition_idx):
        """Assemble the network inputs for one transition of all episodes.

        Args:
            batch: Dict with tensors 'obs', 'obs_next' and 'actions_onehot'.
            transition_idx: Index of the transition inside each episode.

        Returns:
            Tuple ``(inputs, inputs_next)``; each has
            ``episode_num * n_agents`` rows holding the observation followed
            by the optional last-action and agent-id one-hot vectors.
        """
        # actions_onehot is taken whole because the previous step's action is
        # needed as well as the current one.
        obs = batch['obs'][:, transition_idx]
        obs_next = batch['obs_next'][:, transition_idx]
        actions_onehot = batch['actions_onehot'][:]
        episode_num = obs.shape[0]
        inputs, inputs_next = [obs], [obs_next]

        if self.args.last_action:
            if transition_idx == 0:
                # First transition: there is no previous action, use zeros.
                inputs.append(
                    torch.zeros_like(actions_onehot[:, transition_idx]))
            else:
                inputs.append(actions_onehot[:, transition_idx - 1])
            inputs_next.append(actions_onehot[:, transition_idx])

        if self.args.reuse_network:
            # One network is shared by all agents, so each row is tagged with
            # a one-hot agent id. obs is (episode, agent, feature) and agent i
            # sits in row i, so the ids of one episode form an identity
            # matrix, repeated for every episode.
            agent_ids = torch.eye(self.n_agents).unsqueeze(0).expand(
                episode_num, -1, -1)
            inputs.append(agent_ids)
            inputs_next.append(agent_ids)

        # Flatten (episode, agent) into a single row dimension and join the
        # pieces along the feature dimension. Every row carries its own agent
        # id, so rows stay distinguishable for the shared network.
        rows = episode_num * self.n_agents
        inputs = torch.cat([x.reshape(rows, -1) for x in inputs], dim=1)
        inputs_next = torch.cat(
            [x.reshape(rows, -1) for x in inputs_next], dim=1)

        # TODO note from github: Check if inputs_next is equivalent to moving inputs backward
        return inputs, inputs_next

    def init_hidden(self, episode_num):
        """Reset eval and target hidden states to zeros.

        One hidden vector of size ``args.rnn_hidden_dim`` is kept for every
        agent of every episode.
        """
        shape = (episode_num, self.n_agents, self.args.rnn_hidden_dim)
        self.eval_hidden = torch.zeros(shape)
        self.target_hidden = torch.zeros(shape)

    def save_model(self, train_step):
        """Save the evaluation networks' parameters under ``model_dir``.

        File names are prefixed with ``train_step // args.save_cycle``.
        """
        num = str(train_step // self.args.save_cycle)
        os.makedirs(self.model_dir, exist_ok=True)
        torch.save(self.eval_vdn_net.state_dict(),
                   self.model_dir + '/' + num + '_vdn_net_params.pkl')
        torch.save(self.eval_rnn.state_dict(),
                   self.model_dir + '/' + num + '_rnn_net_params.pkl')