import os
from abc import ABC, abstractmethod

import numpy as np
import torch
from torch.utils.tensorboard import SummaryWriter

# Project-local modules; these import paths are assumptions and may need to be
# adjusted to this repository's actual layout.
from memory import (LazyMultiStepMemory, LazyPrioritizedMultiStepMemory,
                    RecurrentMemory)
from utils import RunningMeanStats, update_params


class BaseAgent(ABC):

    def __init__(self, env, test_env, log_dir, num_steps=100000,
                 batch_size=64, memory_size=1000000, gamma=0.99, multi_step=1,
                 target_entropy_ratio=0.98, start_steps=20000,
                 update_interval=4, target_update_interval=8000,
                 use_per=False, num_eval_steps=125000, max_episode_steps=27000,
                 log_interval=10, eval_interval=1000, device='cuda:0', seed=0):
        super().__init__()
        self.env = env
        self.test_env = test_env

        # Set seed.
        torch.manual_seed(seed)
        np.random.seed(seed)
        self.env.seed(seed)
        self.test_env.seed(2**31 - 1 - seed)
        # torch.backends.cudnn.deterministic = True  # It harms performance.
        # torch.backends.cudnn.benchmark = False     # It harms performance.

        self.device = torch.device(
            device if torch.cuda.is_available() else "cpu")

        # LazyMemory efficiently stores FrameStacked states.
        if use_per:
            beta_steps = (num_steps - start_steps) / update_interval
            self.memory = LazyPrioritizedMultiStepMemory(
                capacity=memory_size,
                state_shape=self.env.observation_space.shape,
                device=self.device, gamma=gamma, multi_step=multi_step,
                beta_steps=beta_steps)
        else:
            self.memory = LazyMultiStepMemory(
                capacity=memory_size,
                state_shape=self.env.observation_space.shape,
                device=self.device, gamma=gamma, multi_step=multi_step)

        self.log_dir = log_dir
        self.model_dir = os.path.join(log_dir, 'model')
        self.summary_dir = os.path.join(log_dir, 'summary')
        if not os.path.exists(self.model_dir):
            os.makedirs(self.model_dir)
        if not os.path.exists(self.summary_dir):
            os.makedirs(self.summary_dir)

        self.writer = SummaryWriter(log_dir=self.summary_dir)
        self.train_return = RunningMeanStats(log_interval)

        self.steps = 0
        self.learning_steps = 0
        self.episodes = 0
        self.best_eval_score = -np.inf
        self.num_steps = num_steps
        self.batch_size = batch_size
        self.gamma_n = gamma ** multi_step
        self.start_steps = start_steps
        self.update_interval = update_interval
        self.target_update_interval = target_update_interval
        self.use_per = use_per
        self.num_eval_steps = num_eval_steps
        self.max_episode_steps = max_episode_steps
        self.log_interval = log_interval
        self.eval_interval = eval_interval

    def run(self):
        while True:
            self.train_episode()
            if self.steps > self.num_steps:
                break

    def is_update(self):
        return self.steps % self.update_interval == 0 \
            and self.steps >= self.start_steps

    @abstractmethod
    def explore(self, state):
        pass

    @abstractmethod
    def exploit(self, state):
        pass

    @abstractmethod
    def update_target(self):
        pass

    @abstractmethod
    def calc_current_q(self, states, actions, rewards, next_states, dones):
        pass

    @abstractmethod
    def calc_target_q(self, states, actions, rewards, next_states, dones):
        pass

    @abstractmethod
    def calc_critic_loss(self, batch, weights):
        pass

    @abstractmethod
    def calc_policy_loss(self, batch, weights):
        pass

    @abstractmethod
    def calc_entropy_loss(self, entropies, weights):
        pass

    def train_episode(self):
        self.episodes += 1
        episode_return = 0.
        episode_steps = 0
        done = False
        state = self.env.reset()

        while (not done) and episode_steps <= self.max_episode_steps:
            if self.start_steps > self.steps:
                action = self.env.action_space.sample()
            else:
                action = self.explore(state)

            next_state, reward, done, _ = self.env.step(action)

            # Clip reward to [-1.0, 1.0].
            clipped_reward = max(min(reward, 1.0), -1.0)

            # To calculate efficiently, set priority=max_priority here.
            self.memory.append(state, action, clipped_reward, next_state, done)

            self.steps += 1
            episode_steps += 1
            episode_return += reward
            state = next_state

            if self.is_update():
                self.learn()

            if self.steps % self.target_update_interval == 0:
                self.update_target()

            if self.steps % self.eval_interval == 0:
                self.evaluate()
                self.save_models(os.path.join(self.model_dir, 'final'))

        # We log the running mean of training rewards.
        self.train_return.append(episode_return)

        if self.episodes % self.log_interval == 0:
            self.writer.add_scalar(
                'reward/train', self.train_return.get(), self.steps)

        print(f'Episode: {self.episodes:<4} '
              f'Episode steps: {episode_steps:<4} '
              f'Return: {episode_return:<5.1f}')

    def learn(self):
        assert hasattr(self, 'q1_optim') and hasattr(self, 'q2_optim') and \
            hasattr(self, 'policy_optim') and hasattr(self, 'alpha_optim')

        self.learning_steps += 1

        if self.use_per:
            batch, weights = self.memory.sample(self.batch_size)
        else:
            batch = self.memory.sample(self.batch_size)
            # Set priority weights to 1 when we don't use PER.
            weights = 1.

        q1_loss, q2_loss, errors, mean_q1, mean_q2 = \
            self.calc_critic_loss(batch, weights)
        policy_loss, entropies = self.calc_policy_loss(batch, weights)
        entropy_loss = self.calc_entropy_loss(entropies, weights)

        update_params(self.q1_optim, q1_loss)
        update_params(self.q2_optim, q2_loss)
        update_params(self.policy_optim, policy_loss)
        update_params(self.alpha_optim, entropy_loss)

        self.alpha = self.log_alpha.exp()

        if self.use_per:
            self.memory.update_priority(errors)

        if self.learning_steps % self.log_interval == 0:
            self.writer.add_scalar(
                'loss/Q1', q1_loss.detach().item(), self.learning_steps)
            self.writer.add_scalar(
                'loss/Q2', q2_loss.detach().item(), self.learning_steps)
            self.writer.add_scalar(
                'loss/policy', policy_loss.detach().item(),
                self.learning_steps)
            self.writer.add_scalar(
                'loss/alpha', entropy_loss.detach().item(),
                self.learning_steps)
            self.writer.add_scalar(
                'stats/alpha', self.alpha.detach().item(),
                self.learning_steps)
            self.writer.add_scalar(
                'stats/mean_Q1', mean_q1, self.learning_steps)
            self.writer.add_scalar(
                'stats/mean_Q2', mean_q2, self.learning_steps)
            self.writer.add_scalar(
                'stats/entropy', entropies.detach().mean().item(),
                self.learning_steps)

    def evaluate(self):
        num_episodes = 0
        num_steps = 0
        total_return = 0.0

        while True:
            state = self.test_env.reset()
            episode_steps = 0
            episode_return = 0.0
            done = False
            while (not done) and episode_steps <= self.max_episode_steps:
                action = self.exploit(state)
                next_state, reward, done, _ = self.test_env.step(action)
                num_steps += 1
                episode_steps += 1
                episode_return += reward
                state = next_state

            num_episodes += 1
            total_return += episode_return

            if num_steps > self.num_eval_steps:
                break

        mean_return = total_return / num_episodes

        if mean_return > self.best_eval_score:
            self.best_eval_score = mean_return
            self.save_models(os.path.join(self.model_dir, 'best'))

        self.writer.add_scalar('reward/test', mean_return, self.steps)
        print('-' * 60)
        print(f'Num steps: {self.steps:<5} '
              f'return: {mean_return:<5.1f}')
        print('-' * 60)

    @abstractmethod
    def save_models(self, save_dir):
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

    def __del__(self):
        self.env.close()
        self.test_env.close()
        self.writer.close()
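# `update_params` (imported above from the project's utils) is not shown in
# this file. The sketch below is only an assumption about what learn() relies
# on, i.e. the usual zero_grad / backward / step cycle; the project's real
# helper may additionally clip gradients or pass retain_graph.
def update_params_sketch(optim, loss, retain_graph=False):
    """Hypothetical stand-in for utils.update_params."""
    optim.zero_grad()
    loss.backward(retain_graph=retain_graph)
    optim.step()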
# Recurrent variant of BaseAgent: the lazy (optionally prioritized) replay
# memory above is replaced by a RecurrentMemory of fixed-length state
# sequences, and minibatches of sequences are prepared in prep_minibatch().
class BaseAgent(ABC):

    def __init__(self, env, test_env, log_dir, num_steps=100000,
                 batch_size=64, memory_size=1000000, gamma=0.99, multi_step=1,
                 target_entropy_ratio=0.98, start_steps=20000,
                 update_interval=4, target_update_interval=8000,
                 use_per=False, num_eval_steps=50000, max_episode_steps=10000,
                 log_interval=10, eval_interval=500, cuda=True, seed=0):
        super().__init__()
        self.env = env
        self.test_env = test_env

        # Set seed.
        torch.manual_seed(seed)
        np.random.seed(seed)
        self.env.seed(seed)
        self.test_env.seed(2**31 - 1 - seed)
        # torch.backends.cudnn.deterministic = True  # It harms performance.
        # torch.backends.cudnn.benchmark = False     # It harms performance.

        self.device = torch.device(
            "cuda" if cuda and torch.cuda.is_available() else "cpu")

        # The lazy (FrameStack-aware) memories are replaced by a recurrent
        # replay memory that stores fixed-length state sequences.
        # if use_per:
        #     beta_steps = (num_steps - start_steps) / update_interval
        #     self.memory = LazyPrioritizedMultiStepMemory(
        #         capacity=memory_size,
        #         state_shape=self.env.observation_space.shape,
        #         device=self.device, gamma=gamma, multi_step=multi_step,
        #         beta_steps=beta_steps)
        # else:
        #     self.memory = LazyMultiStepMemory(
        #         capacity=memory_size,
        #         state_shape=self.env.observation_space.shape,
        #         device=self.device, gamma=gamma, multi_step=multi_step)
        self.memory = RecurrentMemory(
            capacity=memory_size,
            state_shape=self.env.observation_space.shape,
            device=self.device, sequen_len=6)

        self.log_dir = log_dir
        self.model_dir = os.path.join(log_dir, 'model')
        self.summary_dir = os.path.join(log_dir, 'summary')
        if not os.path.exists(self.model_dir):
            os.makedirs(self.model_dir)
        if not os.path.exists(self.summary_dir):
            os.makedirs(self.summary_dir)

        self.writer = SummaryWriter(log_dir=self.summary_dir)
        self.train_return = RunningMeanStats(log_interval)

        self.steps = 0
        self.learning_steps = 0
        self.episodes = 0
        self.best_eval_score = -np.inf
        self.num_steps = num_steps
        self.batch_size = batch_size
        self.gamma_n = gamma ** multi_step
        self.start_steps = start_steps
        self.update_interval = update_interval
        self.target_update_interval = target_update_interval
        self.use_per = use_per
        self.num_eval_steps = num_eval_steps
        self.max_episode_steps = max_episode_steps
        self.log_interval = log_interval
        self.eval_interval = eval_interval

        self._states = self.env.reset()
        # Observations are flat feature vectors of length num_feats.
        self.num_feats = self.env.observation_space.shape[0]
        # Must match the sequen_len passed to RecurrentMemory above.
        self.sequence_length = 6

    def run(self):
        while True:
            self.train_episode()
            if self.steps > self.num_steps:
                break

    def is_update(self):
        return self.steps % self.update_interval == 0 \
            and self.steps >= self.start_steps

    @abstractmethod
    def explore(self, state):
        pass

    @abstractmethod
    def exploit(self, state):
        pass

    @abstractmethod
    def update_target(self):
        pass

    @abstractmethod
    def calc_current_q(self, states, actions, rewards, next_states, dones):
        pass

    @abstractmethod
    def calc_target_q(self, states, actions, rewards, next_states, dones):
        pass

    @abstractmethod
    def calc_critic_loss(self, batch, weights):
        pass

    @abstractmethod
    def calc_policy_loss(self, batch, weights):
        pass

    @abstractmethod
    def calc_entropy_loss(self, entropies, weights):
        pass

    def train_episode(self):
        self.episodes += 1
        episode_return = 0.
        episode_steps = 0
        done = False
        state = self.env.reset()

        while (not done) and episode_steps <= self.max_episode_steps:
            if self.start_steps > self.steps:
                action = self.env.action_space.sample()
            else:
                action = self.explore(state)

            next_state, reward, done, _ = self.env.step(action)

            # Reward clipping to [-1.0, 1.0] is disabled in this variant.
            # clipped_reward = max(min(reward, 1.0), -1.0)

            self.memory.push(state, action, reward, next_state, done)

            self.steps += 1
            episode_steps += 1
            episode_return += reward
            state = next_state

            if self.is_update():
                self.learn()

            if self.steps % self.target_update_interval == 0:
                self.update_target()

            if self.steps % self.eval_interval == 0:
                self.evaluate()
                self.save_models(os.path.join(self.model_dir, 'final'))

        # We log the running mean of training rewards.
        self.train_return.append(episode_return)

        if self.episodes % self.log_interval == 0:
            self.writer.add_scalar(
                'reward/train', self.train_return.get(), self.steps)

        print(f'Episode: {self.episodes:<4} '
              f'Episode steps: {episode_steps:<4} '
              f'Return: {episode_return:<5.1f}')

    def prep_minibatch(self):
        # sample() is expected to return batch_size * sequence_length
        # (state, action, reward, next_state) transitions, laid out sequence
        # by sequence, with next_state set to None at terminal steps.
        transitions = self.memory.sample(self.batch_size)
        batch_state, batch_action, batch_reward, batch_next_state = zip(
            *transitions)

        shape = (self.batch_size, self.sequence_length, self.num_feats)

        batch_state = torch.tensor(
            batch_state, device=self.device, dtype=torch.float).view(shape)
        batch_action = torch.tensor(
            batch_action, device=self.device, dtype=torch.long).view(
                self.batch_size, self.sequence_length, -1)
        batch_reward = torch.tensor(
            batch_reward, device=self.device, dtype=torch.float).view(
                self.batch_size, self.sequence_length)

        # Get the next state at the end of each sequence.
        batch_next_state = tuple([
            batch_next_state[i] for i in range(len(batch_next_state))
            if (i + 1) % self.sequence_length == 0
        ])

        non_final_mask = torch.tensor(
            tuple(map(lambda s: s is not None, batch_next_state)),
            device=self.device, dtype=torch.bool)
        try:
            # Sometimes all next states are None, especially with n-step
            # returns; in that case there is nothing to bootstrap from.
            non_final_next_states = torch.tensor(
                [s for s in batch_next_state if s is not None],
                device=self.device, dtype=torch.float).unsqueeze(dim=1)
            non_final_next_states = torch.cat(
                [batch_state[non_final_mask, 1:, :], non_final_next_states],
                dim=1)
            empty_next_state_values = False
        except Exception:
            non_final_next_states = None
            empty_next_state_values = True

        return (batch_state, batch_action, batch_reward,
                non_final_next_states, non_final_mask,
                empty_next_state_values)

    def learn(self):
        assert hasattr(self, 'q1_optim') and hasattr(self, 'q2_optim') and \
            hasattr(self, 'policy_optim') and hasattr(self, 'alpha_optim')

        self.learning_steps += 1

        # PER sampling is not used with the recurrent memory; priority
        # weights are fixed to 1.
        # if self.use_per:
        #     batch, weights = self.memory.sample(self.batch_size)
        # else:
        #     batch = self.memory.sample(self.batch_size)
        #     weights = 1.
        batch = self.prep_minibatch()
        weights = 1.

        q1_loss, q2_loss, errors, mean_q1, mean_q2 = \
            self.calc_critic_loss(batch, weights)
        policy_loss, entropies = self.calc_policy_loss(batch, weights)
        entropy_loss = self.calc_entropy_loss(entropies, weights)

        update_params(self.q1_optim, q1_loss)
        update_params(self.q2_optim, q2_loss)
        update_params(self.policy_optim, policy_loss)
        update_params(self.alpha_optim, entropy_loss)

        self.alpha = self.log_alpha.exp()

        if self.use_per:
            self.memory.update_priority(errors)

        if self.learning_steps % self.log_interval == 0:
            self.writer.add_scalar(
                'loss/Q1', q1_loss.detach().item(), self.learning_steps)
            self.writer.add_scalar(
                'loss/Q2', q2_loss.detach().item(), self.learning_steps)
            self.writer.add_scalar(
                'loss/policy', policy_loss.detach().item(),
                self.learning_steps)
            self.writer.add_scalar(
                'loss/alpha', entropy_loss.detach().item(),
                self.learning_steps)
            self.writer.add_scalar(
                'stats/alpha', self.alpha.detach().item(),
                self.learning_steps)
            self.writer.add_scalar(
                'stats/mean_Q1', mean_q1, self.learning_steps)
            self.writer.add_scalar(
                'stats/mean_Q2', mean_q2, self.learning_steps)
            self.writer.add_scalar(
                'stats/entropy', entropies.detach().mean().item(),
                self.learning_steps)

    def evaluate(self):
        num_episodes = 0
        num_steps = 0
        total_return = 0.0

        while True:
            state = self.test_env.reset()
            episode_steps = 0
            episode_return = 0.0
            done = False
            # Rolling window of the most recent states fed to exploit(),
            # shape (seq_len_so_far, num_feats).
            state_memory = state.reshape(1, -1)
            while (not done) and episode_steps <= self.max_episode_steps:
                action = self.exploit(state_memory)
                next_state, reward, done, _ = self.test_env.step(action)
                num_steps += 1
                episode_steps += 1
                episode_return += reward
                state = next_state.reshape(1, -1)
                state_memory = np.insert(
                    state_memory, state_memory.shape[0], values=state, axis=0)
                # Keep only the last sequence_length states.
                if episode_steps >= self.sequence_length:
                    state_memory = np.delete(state_memory, 0, axis=0)

            num_episodes += 1
            total_return += episode_return

            if num_steps > self.num_eval_steps:
                break

        mean_return = total_return / num_episodes

        if mean_return > self.best_eval_score:
            self.best_eval_score = mean_return
            self.save_models(os.path.join(self.model_dir, 'best'))

        self.writer.add_scalar('reward/test', mean_return, self.steps)
        print('-' * 60)
        print(f'Num steps: {self.steps:<5} '
              f'return: {mean_return:<5.1f}')
        print('-' * 60)

    @abstractmethod
    def save_models(self, save_dir):
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

    def __del__(self):
        self.env.close()
        self.test_env.close()
        self.writer.close()
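# `RecurrentMemory` is a project-local class whose implementation is not shown
# in this file. The sketch below is an assumption about the interface the
# recurrent BaseAgent above relies on: push() stores single transitions, and
# sample(batch_size) returns batch_size * sequence_length
# (state, action, reward, next_state) tuples laid out sequence by sequence,
# with next_state=None at terminal steps (prep_minibatch masks those).
# The class name and everything other than push/sample is illustrative only.
import random


class RecurrentMemorySketch:

    def __init__(self, capacity, state_shape, device=None, sequen_len=6):
        # state_shape/device are accepted only to match the constructor call
        # in the recurrent BaseAgent above.
        self.capacity = capacity
        self.seq_len = sequen_len
        self.buffer = []

    def push(self, state, action, reward, next_state, done):
        # Store None as next_state on terminal steps so the learner can
        # mask out bootstrapping there.
        self.buffer.append((state, action, reward,
                            None if done else next_state))
        if len(self.buffer) > self.capacity:
            self.buffer.pop(0)

    def sample(self, batch_size):
        transitions = []
        for _ in range(batch_size):
            end = random.randint(1, len(self.buffer))
            seq = list(self.buffer[max(0, end - self.seq_len):end])
            # Never let a sequence cross an episode boundary: drop everything
            # up to and including the last terminal step inside the window.
            for i in range(len(seq) - 2, -1, -1):
                if seq[i][3] is None:
                    seq = seq[i + 1:]
                    break
            # Left-pad with zero transitions to a fixed length of seq_len.
            zero_state = np.zeros_like(seq[0][0])
            pad = (zero_state, 0, 0.0, zero_state)
            seq = [pad] * (self.seq_len - len(seq)) + seq
            transitions.extend(seq)
        return transitions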