    team = 'HELIOS19'
elif team == 'robocin':
    team = 'RoboCIn'

agent = DDPGAgent(DDPG, False, team=team, port=6000,
                  num_agents=int(num_agents), num_ops=int(num_agents))
processes = []

# Share network parameters across the worker processes
agent.ddpg.actor.share_memory()
agent.ddpg.target_actor.share_memory()
agent.ddpg.critic.share_memory()
agent.ddpg.target_critic.share_memory()

if agent.gen_mem:
    memories = [MemoryDeque(MEM_SIZE) for _ in range(agent.num_agents)]
else:
    memories = agent.ddpg.memory
    agent.ddpg.memory = None

# Launch one worker process per agent
for rank in range(agent.num_agents):
    p = mp.Process(target=run, args=(agent.port, agent.team, agent.actions,
                                     agent.rewards, rank, agent.ddpg,
                                     memories[rank], agent.test, episodes))
    p.start()
    processes.append(p)

for p in processes:
    p.join()

agent.save_model(bye=True)
exit(1)
class DDPGTrain(BaseTrain):
    def __init__(self, static_policy=False, env=None, config=None):
        super(DDPGTrain, self).__init__(config=config, env=env)
        self.priority_replay = config.USE_PRIORITY_REPLAY
        self.gamma = config.GAMMA
        self.lr = config.LR
        self.experience_replay_size = config.EXP_REPLAY_SIZE
        self.batch_size = config.BATCH_SIZE
        self.learn_start = config.LEARN_START
        self.priority_beta_start = config.PRIORITY_BETA_START
        self.priority_beta_frames = config.PRIORITY_BETA_FRAMES
        self.priority_alpha = config.PRIORITY_ALPHA
        self.tau = config.tau
        self.static_policy = static_policy
        self.num_feats = env.observation_space.shape
        self.env = env
        # self.writer = SummaryWriter(
        #     f'./saved_agents/DDPG/agent_{self.env.getUnum()}')

        self.declare_networks()

        actor_learning_rate = 1e-4
        critic_learning_rate = 1e-3
        self.num_actor_update_iteration = 0
        self.num_critic_update_iteration = 0
        self.critic_criterion = nn.MSELoss()
        self.actor_optimizer = optim.Adam(
            self.actor.parameters(), lr=actor_learning_rate)
        self.critic_optimizer = optim.Adam(
            self.critic.parameters(), lr=critic_learning_rate)
        self.actor_loss = []
        self.critic_loss = []

        # move to correct device
        self.actor = self.actor.to(self.device)
        self.target_actor = self.target_actor.to(self.device)
        self.critic = self.critic.to(self.device)
        self.target_critic = self.target_critic.to(self.device)

        if self.static_policy:
            self.actor.eval()
            self.target_actor.eval()
            self.critic.eval()
            self.target_critic.eval()

        self.update_count = 0
        self.declare_memory()
        self.nsteps = config.N_STEPS
        self.nstep_buffer = []

    def save_w(self,
               path_models=('./saved_agents/actor.dump',
                            './saved_agents/critic.dump'),
               path_optims=('./saved_agents/actor_optim.dump',
                            './saved_agents/critic_optim.dump')):
        torch.save(self.actor.state_dict(), path_models[0])
        torch.save(self.critic.state_dict(), path_models[1])
        torch.save(self.actor_optimizer.state_dict(), path_optims[0])
        torch.save(self.critic_optimizer.state_dict(), path_optims[1])

    def load_w(self,
               path_models=('./saved_agents/actor.dump',
                            './saved_agents/critic.dump'),
               path_optims=('./saved_agents/actor_optim.dump',
                            './saved_agents/critic_optim.dump')):
        fname_actor = path_models[0]
        fname_critic = path_models[1]
        fname_actor_optim = path_optims[0]
        fname_critic_optim = path_optims[1]

        if os.path.isfile(fname_actor):
            self.actor.load_state_dict(
                torch.load(fname_actor, map_location=self.device))
            # keep the target network in sync with the loaded weights
            for target_param, param in zip(self.target_actor.parameters(),
                                           self.actor.parameters()):
                target_param.data.copy_(param.data)

        if os.path.isfile(fname_critic):
            self.critic.load_state_dict(
                torch.load(fname_critic, map_location=self.device))
            for target_param, param in zip(self.target_critic.parameters(),
                                           self.critic.parameters()):
                target_param.data.copy_(param.data)

        if os.path.isfile(fname_actor_optim):
            self.actor_optimizer.load_state_dict(
                torch.load(fname_actor_optim, map_location=self.device))

        if os.path.isfile(fname_critic_optim):
            self.critic_optimizer.load_state_dict(
                torch.load(fname_critic_optim, map_location=self.device))

    def declare_networks(self):
        # defined by concrete subclasses (actor/critic architectures)
        pass

    def declare_memory(self):
        self.memory = MemoryDeque(self.experience_replay_size)

    def append_to_replay(self, s, a, r, s_, d):
        self.memory.store((s, a, r, s_, d))

    def update(self):  # faster
        state, next_state, action, reward, done = self.memory.sample(
            self.batch_size)
        reward = reward.reshape(-1, 1)
        done = done.reshape(-1, 1)
        num_feat = state.shape[1] * state.shape[2]
        state = Variable(torch.FloatTensor(
            np.float32(state))).view(self.batch_size, num_feat)
        next_state = Variable(torch.FloatTensor(
            np.float32(next_state))).view(self.batch_size, num_feat)
        action = Variable(torch.FloatTensor(action))
        reward = Variable(torch.FloatTensor(reward))
        done = Variable(torch.FloatTensor(done))

        # Compute the target Q value
        acts = self.target_actor(next_state)
        target_Q = self.target_critic(next_state, acts)
        target_Q = reward + (self.gamma * target_Q * (1 - done)).detach()

        # Get current Q estimate
        current_Q = self.critic(state, action)

        # Compute critic loss
        critic_loss = F.mse_loss(current_Q, target_Q)

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()
        # self.writer.add_scalar('Loss/ddpg/critic_loss', critic_loss,
        #                        global_step=self.num_critic_update_iteration)
        # self.critic_loss.append(critic_loss)

        # Compute actor loss
        acts = self.actor(state)
        actor_loss = -self.critic(state, acts).mean()

        # Optimize the actor
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        # self.writer.add_scalar(
        #     'Loss/ddpg/actor_loss', actor_loss,
        #     global_step=self.num_actor_update_iteration)
        # self.actor_loss.append(actor_loss)

        self.num_actor_update_iteration += 1
        self.num_critic_update_iteration += 1

        # Soft-update the frozen target models
        if self.num_critic_update_iteration % 1000 == 0:
            for param, target_param in zip(self.critic.parameters(),
                                           self.target_critic.parameters()):
                target_param.data.copy_(
                    self.tau * param.data + (1 - self.tau) * target_param.data)

        if self.num_actor_update_iteration % 1000 == 0:
            for param, target_param in zip(self.actor.parameters(),
                                           self.target_actor.parameters()):
                target_param.data.copy_(
                    self.tau * param.data + (1 - self.tau) * target_param.data)

    def get_action(self, s):
        with torch.no_grad():
            num_feat = s.shape[0] * s.shape[1]
            state = Variable(torch.from_numpy(s).float().unsqueeze(0))
            state = state.view(1, num_feat)
            action = self.actor.forward(state)
            action = action.detach().cpu().numpy()[0, 0]
            return action
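# --- Usage sketch (not part of the original sources) ---
# A minimal loop showing how a concrete DDPGTrain subclass could be driven.
# `DDPGAgentNet` is a hypothetical subclass whose declare_networks() builds
# self.actor, self.target_actor, self.critic and self.target_critic; `env` is
# assumed to follow the gym-style reset()/step() interface and to return 2-D
# observations, and MemoryDeque is assumed to support len().
def train_ddpg_sketch(env, config, episodes=1000):
    agent = DDPGAgentNet(static_policy=False, env=env, config=config)
    agent.load_w()  # resume from ./saved_agents/*.dump if present
    for _ in range(episodes):
        state, done = env.reset(), False
        while not done:
            action = agent.get_action(state)
            next_state, reward, done, _ = env.step(action)
            agent.append_to_replay(state, action, reward, next_state, int(done))
            if len(agent.memory) > agent.batch_size:
                agent.update()  # one critic step + one actor step
            state = next_state
    agent.save_w()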
def declare_memory(self):
    if not self.priority_replay:
        self.memory = MemoryDeque(self.experience_replay_size)
    else:
        self.memory = Memory(int(self.experience_replay_size))
def declare_memory(self):
    self.memory = MemoryDeque(self.experience_replay_size)
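# --- Interface sketch (not part of the original sources) ---
# MemoryDeque is defined elsewhere in the project; this sketch only mirrors the
# interface assumed by declare_memory(), append_to_replay() and the update
# methods above: store((s, a, r, s_, d)) and sample(batch_size) returning
# numpy arrays in the order (state, next_state, action, reward, done).
import random
from collections import deque

import numpy as np


class MemoryDequeSketch:
    def __init__(self, capacity):
        self.buffer = deque(maxlen=capacity)

    def store(self, transition):
        # transition is the tuple (state, action, reward, next_state, done)
        self.buffer.append(transition)

    def sample(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        state, action, reward, next_state, done = map(np.array, zip(*batch))
        # returned in the order the trainers unpack it
        return state, next_state, action, reward, done

    def __len__(self):
        return len(self.buffer)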
class DuelingTrain(BaseTrain):
    def __init__(self, static_policy=False, env=None, config=None):
        super(DuelingTrain, self).__init__(config=config, env=env)
        self.noisy = config.USE_NOISY_NETS
        self.priority_replay = config.USE_PRIORITY_REPLAY
        self.gamma = config.GAMMA
        self.lr = config.LR
        self.target_net_update_freq = config.TARGET_NET_UPDATE_FREQ
        self.experience_replay_size = config.EXP_REPLAY_SIZE
        self.batch_size = config.BATCH_SIZE
        self.update_freq = config.UPDATE_FREQ
        self.sigma_init = config.SIGMA_INIT
        self.priority_beta_start = config.PRIORITY_BETA_START
        self.priority_beta_frames = config.PRIORITY_BETA_FRAMES
        self.priority_alpha = config.PRIORITY_ALPHA
        self.tau = config.tau
        self.static_policy = static_policy
        self.num_actions = env.action_space.n
        self.env = env

        self.declare_networks()
        self.writer = SummaryWriter(
            f'./saved_agents/DDQN/agent_{self.env.getUnum()}')
        self.losses = []
        self.target_model.load_state_dict(self.model.state_dict())
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)

        # move to correct device
        self.model = self.model.to(self.device)
        self.target_model.to(self.device)

        if self.static_policy:
            self.model.eval()
            self.target_model.eval()

        self.update_count = 0
        self.update_iteration = 0
        self.declare_memory()
        self.nsteps = config.N_STEPS
        self.nstep_buffer = []

    def load_w(self, model_path=None, optim_path=None):
        if model_path is None:
            fname_model = "./saved_agents/model.dump"
        else:
            fname_model = model_path
        if optim_path is None:
            fname_optim = "./saved_agents/optim.dump"
        else:
            fname_optim = optim_path

        if os.path.isfile(fname_model):
            self.model.load_state_dict(torch.load(
                fname_model, map_location=self.device))
            self.target_model.load_state_dict(self.model.state_dict())

        if os.path.isfile(fname_optim):
            self.optimizer.load_state_dict(
                torch.load(fname_optim, map_location=self.device))

    def declare_networks(self):
        # defined by concrete subclasses (dueling network architecture)
        pass

    def declare_memory(self):
        self.memory = MemoryDeque(self.experience_replay_size)

    def append_to_replay(self, s, a, r, s_, d):
        self.memory.store((s, a, r, s_, d))

    def compute_td_loss(self):
        state, next_state, action, reward, done = self.memory.sample(
            self.batch_size)
        num_feat = state.shape[1] * state.shape[2]
        state = Variable(torch.FloatTensor(
            np.float32(state))).view(self.batch_size, num_feat)
        next_state = Variable(torch.FloatTensor(
            np.float32(next_state))).view(self.batch_size, num_feat)
        action = Variable(torch.LongTensor(action))
        reward = Variable(torch.FloatTensor(reward))
        done = Variable(torch.FloatTensor(done))

        q_values = self.model(state)
        next_q_values = self.target_model(next_state)

        q_value = q_values.gather(1, action.unsqueeze(1)).squeeze(1)
        next_q_value = next_q_values.max(1)[0]
        expected_q_value = reward + self.gamma * next_q_value * (1 - done)

        loss = (q_value - expected_q_value.detach()).pow(2).mean()

        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        return loss

    def update(self, frame=0):
        loss = self.compute_td_loss()
        unum = self.env.getUnum()
        self.writer.add_scalar(
            f'Loss/loss_{unum}', loss, global_step=self.update_iteration)
        self.losses.append(loss)
        self.update_iteration += 1
        self.update_target_model()

    def get_action(self, s, eps=0.1):  # faster
        with torch.no_grad():
            if np.random.uniform() >= eps or self.static_policy or self.noisy:
                X = torch.tensor([s], device=self.device, dtype=torch.float)
                X = X.view(1, -1)
                out = self.model(X)
                maxout = out.argmax()
                return maxout.item()
            else:
                return np.random.randint(0, self.num_actions)

    def update_target_model(self):
        self.update_count += 1
        self.update_count = self.update_count % self.target_net_update_freq
        if self.update_count == 0:
            for param, target_param in zip(self.model.parameters(),
                                           self.target_model.parameters()):
                target_param.data.copy_(
                    self.tau * param.data + (1 - self.tau) * target_param.data)
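# --- Usage sketch (not part of the original sources) ---
# A minimal epsilon-greedy loop showing how a concrete DuelingTrain subclass
# could be driven. `DuelingDQNNet` is a hypothetical subclass whose
# declare_networks() builds self.model and self.target_model; `env`, `config`
# and the linear epsilon schedule are placeholders.
def train_dueling_sketch(env, config, episodes=1000, eps_start=1.0, eps_end=0.1):
    agent = DuelingDQNNet(static_policy=False, env=env, config=config)
    for episode in range(episodes):
        eps = max(eps_end, eps_start - (eps_start - eps_end) * episode / episodes)
        state, done = env.reset(), False
        while not done:
            action = agent.get_action(state, eps=eps)
            next_state, reward, done, _ = env.step(action)
            agent.append_to_replay(state, action, reward, next_state, int(done))
            if len(agent.memory) > agent.batch_size:
                agent.update()  # one TD step plus a possible target-model update
            state = next_state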