import random

import numpy as np
import torch
from torch.autograd import Variable

# NOTE: Actor, Critic, hard_update, soft_update and OrnsteinUhlenbeckProcess are
# assumed to come from this repository's network / utility modules; their exact
# import paths are not shown in this file.


class BiCNet:
    def __init__(self, s_dim, a_dim, n_agents, **kwargs):
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.config = kwargs['config']
        self.n_agents = n_agents
        self.device = 'cuda' if self.config.use_cuda else 'cpu'

        # Networks
        self.policy = Actor(s_dim, a_dim, n_agents)
        self.policy_target = Actor(s_dim, a_dim, n_agents)
        self.critic = Critic(s_dim, a_dim, n_agents)
        self.critic_target = Critic(s_dim, a_dim, n_agents)

        if self.config.use_cuda:
            self.policy.cuda()
            self.policy_target.cuda()
            self.critic.cuda()
            self.critic_target.cuda()

        self.policy_optimizer = torch.optim.Adam(self.policy.parameters(),
                                                 lr=self.config.a_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=self.config.c_lr)

        # Start the target networks as exact copies of the online networks.
        hard_update(self.policy, self.policy_target)
        hard_update(self.critic, self.critic_target)

        # Ornstein-Uhlenbeck process for temporally correlated exploration noise.
        self.random_process = OrnsteinUhlenbeckProcess(size=self.a_dim,
                                                       theta=self.config.ou_theta,
                                                       mu=self.config.ou_mu,
                                                       sigma=self.config.ou_sigma)
        self.replay_buffer = list()
        self.epsilon = 1.
        self.depsilon = self.epsilon / self.config.epsilon_decay

        self.c_loss = None
        self.a_loss = None
        self.action_log = list()

    def choose_action(self, obs, noisy=True):
        obs = torch.Tensor([obs]).to(self.device)

        action = self.policy(obs).cpu().detach().numpy()[0]
        self.action_log.append(action)

        if noisy:
            for agent_idx in range(self.n_agents):
                pass
                # action[agent_idx] += self.epsilon * self.random_process.sample()
            self.epsilon -= self.depsilon
            self.epsilon = max(self.epsilon, 0.001)

        action = np.clip(action, -1., 1.)

        return action

    def reset(self):
        self.random_process.reset_states()
        self.action_log.clear()

    def prep_train(self):
        self.policy.train()
        self.critic.train()
        self.policy_target.train()
        self.critic_target.train()

    def prep_eval(self):
        self.policy.eval()
        self.critic.eval()
        self.policy_target.eval()
        self.critic_target.eval()

    def random_action(self):
        return np.random.uniform(low=-1, high=1, size=(self.n_agents, 2))

    def memory(self, s, a, r, s_, done):
        self.replay_buffer.append((s, a, r, s_, done))
        if len(self.replay_buffer) >= self.config.memory_length:
            self.replay_buffer.pop(0)

    def get_batches(self):
        experiences = random.sample(self.replay_buffer, self.config.batch_size)

        state_batches = np.array([_[0] for _ in experiences])
        action_batches = np.array([_[1] for _ in experiences])
        reward_batches = np.array([_[2] for _ in experiences])
        next_state_batches = np.array([_[3] for _ in experiences])
        done_batches = np.array([_[4] for _ in experiences])

        return state_batches, action_batches, reward_batches, next_state_batches, done_batches

    def train(self):
        state_batches, action_batches, reward_batches, next_state_batches, done_batches = self.get_batches()

        state_batches = torch.Tensor(state_batches).to(self.device)
        action_batches = torch.Tensor(action_batches).to(self.device)
        reward_batches = torch.Tensor(reward_batches).reshape(
            self.config.batch_size, self.n_agents, 1).to(self.device)
        next_state_batches = torch.Tensor(next_state_batches).to(self.device)
        # Mask that is 1 for non-terminal transitions and 0 for terminal ones.
        done_batches = torch.Tensor(
            (done_batches == False) * 1).reshape(self.config.batch_size,
                                                 self.n_agents, 1).to(self.device)

        target_next_actions = self.policy_target.forward(next_state_batches)
        target_next_q = self.critic_target.forward(next_state_batches,
                                                   target_next_actions)
        main_q = self.critic(state_batches, action_batches)

        '''
        How to concat each agent's Q value?
        '''
        # target_next_q = target_next_q
        # main_q = main_q.mean(dim=1)

        '''
        Reward Norm
        '''
        # reward_batches = (reward_batches - reward_batches.mean(dim=0)) / reward_batches.std(dim=0) / 1024

        # Critic Loss: regress the online Q towards the one-step TD target.
        self.critic.zero_grad()
        baselines = reward_batches + done_batches * self.config.gamma * target_next_q
        loss_critic = torch.nn.MSELoss()(main_q, baselines.detach())
        loss_critic.backward()
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 0.5)
        self.critic_optimizer.step()

        # Actor Loss: maximise Q under the current policy, plus a small action-magnitude penalty.
        self.policy.zero_grad()
        clear_action_batches = self.policy.forward(state_batches)
        loss_actor = -self.critic.forward(state_batches, clear_action_batches).mean()
        loss_actor += (clear_action_batches ** 2).mean() * 1e-3
        loss_actor.backward()
        torch.nn.utils.clip_grad_norm_(self.policy.parameters(), 0.5)
        self.policy_optimizer.step()

        # This is for logging
        self.c_loss = loss_critic.item()
        self.a_loss = loss_actor.item()

        # Polyak-average the online networks into the target networks.
        soft_update(self.policy, self.policy_target, self.config.tau)
        soft_update(self.critic, self.critic_target, self.config.tau)

    def get_loss(self):
        return self.c_loss, self.a_loss

    def get_action_std(self):
        return np.array(self.action_log).std(axis=-1).mean()
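
# Reference sketch: minimal versions of the parameter-update helpers used above,
# assuming the first argument is the online network and the second its target
# (matching the call sites in this file). The repository's own hard_update /
# soft_update implementations, if available, should take precedence.
def hard_update(source, target):
    # Copy every parameter of the online network into the target network.
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(source_param.data)


def soft_update(source, target, tau):
    # Polyak averaging: target <- tau * online + (1 - tau) * target.
    for target_param, source_param in zip(target.parameters(), source.parameters()):
        target_param.data.copy_(tau * source_param.data +
                                (1.0 - tau) * target_param.data)
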
class Preyer:
    def __init__(self, s_dim, a_dim, **kwargs):
        self.s_dim = s_dim
        self.a_dim = a_dim
        self.config = kwargs['config']
        self.device = 'cuda' if self.config.use_cuda else 'cpu'

        self.actor = Actor(s_dim, a_dim)
        self.actor_target = Actor(s_dim, a_dim)
        self.critic = Critic(s_dim, a_dim, 1)
        self.critic_target = Critic(s_dim, a_dim, 1)

        self.actor_optimizer = torch.optim.Adam(self.actor.parameters(),
                                                lr=self.config.a_lr)
        self.critic_optimizer = torch.optim.Adam(self.critic.parameters(),
                                                 lr=self.config.c_lr)

        self.c_loss = 0
        self.a_loss = 0

        if self.config.use_cuda:
            self.actor.cuda()
            self.actor_target.cuda()
            self.critic.cuda()
            self.critic_target.cuda()

        hard_update(self.actor, self.actor_target)
        hard_update(self.critic, self.critic_target)

        self.random_process = OrnsteinUhlenbeckProcess(size=self.a_dim,
                                                       theta=self.config.ou_theta,
                                                       mu=self.config.ou_mu,
                                                       sigma=self.config.ou_sigma)
        self.replay_buffer = list()
        self.epsilon = 1.
        self.depsilon = self.epsilon / self.config.epsilon_decay

    def memory(self, s, a, r, s_, done):
        self.replay_buffer.append((s, a, r, s_, done))
        if len(self.replay_buffer) >= self.config.memory_length:
            self.replay_buffer.pop(0)

    def get_batches(self):
        experiences = random.sample(self.replay_buffer, self.config.batch_size)

        state_batches = np.array([_[0] for _ in experiences])
        action_batches = np.array([_[1] for _ in experiences])
        reward_batches = np.array([_[2] for _ in experiences])
        next_state_batches = np.array([_[3] for _ in experiences])
        done_batches = np.array([_[4] for _ in experiences])

        return state_batches, action_batches, reward_batches, next_state_batches, done_batches

    def choose_action(self, s, noisy=True):
        if self.config.use_cuda:
            s = Variable(torch.cuda.FloatTensor(s))
        else:
            s = Variable(torch.FloatTensor(s))

        a = self.actor.forward(s).cpu().detach().numpy()
        if noisy:
            a += max(self.epsilon, 0.001) * self.random_process.sample()
            self.epsilon -= self.depsilon
        a = np.clip(a, -1., 1.)

        return np.array([a])

    def random_action(self):
        action = np.random.uniform(low=-1., high=1., size=(1, self.a_dim))
        return action

    def reset(self):
        self.random_process.reset_states()

    def train(self):
        state_batches, action_batches, reward_batches, next_state_batches, done_batches = self.get_batches()

        state_batches = Variable(torch.Tensor(state_batches).to(self.device))
        action_batches = Variable(
            torch.Tensor(action_batches).reshape(-1, 1).to(self.device))
        reward_batches = Variable(
            torch.Tensor(reward_batches).reshape(-1, 1).to(self.device))
        next_state_batches = Variable(
            torch.Tensor(next_state_batches).to(self.device))
        done_batches = Variable(
            torch.Tensor(
                (done_batches == False) * 1).reshape(-1, 1).to(self.device))

        target_next_actions = self.actor_target.forward(
            next_state_batches).detach()
        target_next_q = self.critic_target.forward(
            next_state_batches, target_next_actions).detach()
        main_q = self.critic(state_batches, action_batches)

        # Critic Loss
        self.critic.zero_grad()
        baselines = reward_batches + done_batches * self.config.gamma * target_next_q
        loss_critic = torch.nn.MSELoss()(main_q, baselines)
        loss_critic.backward()
        self.critic_optimizer.step()

        # Actor Loss
        self.actor.zero_grad()
        clear_action_batches = self.actor.forward(state_batches)
        loss_actor = (
            -self.critic.forward(state_batches, clear_action_batches)).mean()
        loss_actor.backward()
        self.actor_optimizer.step()

        # This is for logging
        self.c_loss = loss_critic.item()
        self.a_loss = loss_actor.item()

        soft_update(self.actor, self.actor_target, self.config.tau)
        soft_update(self.critic, self.critic_target, self.config.tau)

    def getLoss(self):
        return self.c_loss, self.a_loss
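
# Usage sketch: a minimal driver loop showing how the agents above are intended to be
# used. `env` and `run_episode` are hypothetical stand-ins: `env` is assumed to follow
# a Gym-style reset()/step() API that returns observations, rewards and done flags in
# the shapes the agent expects; the config object is the one passed at construction.
def run_episode(env, agent, train=True):
    obs = env.reset()
    agent.reset()
    episode_reward = 0.0
    terminal = False
    while not terminal:
        # Noisy actions while training, deterministic actions while evaluating.
        action = agent.choose_action(obs, noisy=train)
        next_obs, reward, done, _ = env.step(action)
        agent.memory(obs, action, reward, next_obs, done)
        # Start updating once enough transitions have been collected.
        if train and len(agent.replay_buffer) >= agent.config.batch_size:
            agent.train()
        obs = next_obs
        episode_reward += float(np.sum(reward))
        terminal = bool(np.all(done))
    return episode_reward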