def train_epoch(self):
    play_time_start = time.time()
    # Collect rollouts without gradient tracking.
    with torch.no_grad():
        if self.is_rnn:
            batch_dict = self.play_steps_rnn()
        else:
            batch_dict = self.play_steps()
    play_time_end = time.time()

    update_time_start = time.time()
    rnn_masks = batch_dict.get('rnn_masks', None)
    self.curr_frames = batch_dict.pop('played_frames')
    self.prepare_dataset(batch_dict)
    self.algo_observer.after_steps()

    a_losses = []
    c_losses = []
    entropies = []
    kls = []

    if self.has_central_value:
        self.train_central_value()

    if self.is_rnn:
        print('non masked rnn obs ratio: ', rnn_masks.sum().item() / rnn_masks.nelement())

    for _ in range(0, self.mini_epochs_num):
        ep_kls = []
        for i in range(len(self.dataset)):
            a_loss, c_loss, entropy, kl, last_lr, lr_mul = self.train_actor_critic(self.dataset[i])
            a_losses.append(a_loss)
            c_losses.append(c_loss)
            ep_kls.append(kl)
            entropies.append(entropy)

        # Average the KL over the mini-epoch and let the scheduler adapt the LR.
        av_kls = torch_ext.mean_list(ep_kls)
        if self.multi_gpu:
            av_kls = self.hvd.average_value(av_kls, 'ep_kls')

        self.last_lr, self.entropy_coef = self.scheduler.update(self.last_lr, self.entropy_coef, self.epoch_num, 0, av_kls.item())
        self.update_lr(self.last_lr)
        kls.append(av_kls)

    update_time_end = time.time()
    play_time = play_time_end - play_time_start
    update_time = update_time_end - update_time_start
    total_time = update_time_end - play_time_start

    return play_time, update_time, total_time, a_losses, c_losses, entropies, kls, last_lr, lr_mul
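# A minimal sketch of the `torch_ext.mean_list` helper used above, assuming it
# averages a list of 0-d loss/KL tensors into a single tensor (so `.item()`
# works on the result); the actual library implementation may differ.
import torch

def mean_list(tensors):
    # Stack the per-minibatch scalars into one tensor and reduce over the
    # new leading dimension.
    return torch.mean(torch.stack(tensors), dim=0)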
def train(self):
    self.init_tensors()
    self.algo_observer.after_init(self)
    self.last_mean_rewards = -100500
    total_time = 0
    # rep_count = 0
    self.frame = 0
    self.obs = self.env_reset()

    while True:
        self.epoch_num += 1
        step_time, play_time, update_time, epoch_total_time, actor_losses, entropies, alphas, alpha_losses, critic1_losses, critic2_losses = self.train_epoch()

        total_time += epoch_total_time
        scaled_time = epoch_total_time
        scaled_play_time = play_time
        curr_frames = self.num_frames_per_epoch
        self.frame += curr_frames
        frame = self.frame
        # TODO: Fix frame
        # print(frame)

        if self.print_stats:
            fps_step = curr_frames / scaled_play_time
            fps_total = curr_frames / scaled_time
            print(f'fps step: {fps_step:.1f} fps total: {fps_total:.1f}')

        self.writer.add_scalar('performance/step_inference_rl_update_fps', curr_frames / scaled_time, frame)
        self.writer.add_scalar('performance/step_inference_fps', curr_frames / scaled_play_time, frame)
        self.writer.add_scalar('performance/step_fps', curr_frames / step_time, frame)
        self.writer.add_scalar('performance/rl_update_time', update_time, frame)
        self.writer.add_scalar('performance/step_inference_time', play_time, frame)
        self.writer.add_scalar('performance/step_time', step_time, frame)

        # Losses are only meaningful once the random-action seeding phase is over.
        if self.epoch_num >= self.num_seed_steps:
            self.writer.add_scalar('losses/a_loss', torch_ext.mean_list(actor_losses).item(), frame)
            self.writer.add_scalar('losses/c1_loss', torch_ext.mean_list(critic1_losses).item(), frame)
            self.writer.add_scalar('losses/c2_loss', torch_ext.mean_list(critic2_losses).item(), frame)
            self.writer.add_scalar('losses/entropy', torch_ext.mean_list(entropies).item(), frame)
            if alpha_losses[0] is not None:
                self.writer.add_scalar('losses/alpha_loss', torch_ext.mean_list(alpha_losses).item(), frame)
            self.writer.add_scalar('info/alpha', torch_ext.mean_list(alphas).item(), frame)

        self.writer.add_scalar('info/epochs', self.epoch_num, frame)
        self.algo_observer.after_print_stats(frame, self.epoch_num, total_time)

        if self.game_rewards.current_size > 0:
            mean_rewards = self.game_rewards.get_mean()
            mean_lengths = self.game_lengths.get_mean()

            self.writer.add_scalar('rewards/step', mean_rewards, frame)
            # self.writer.add_scalar('rewards/iter', mean_rewards, epoch_num)
            self.writer.add_scalar('rewards/time', mean_rewards, total_time)
            self.writer.add_scalar('episode_lengths/step', mean_lengths, frame)
            # self.writer.add_scalar('episode_lengths/iter', mean_lengths, epoch_num)
            self.writer.add_scalar('episode_lengths/time', mean_lengths, total_time)

            if mean_rewards > self.last_mean_rewards and self.epoch_num >= self.save_best_after:
                print('saving next best rewards: ', mean_rewards)
                self.last_mean_rewards = mean_rewards
                self.save("./nn/" + self.config['name'])
                if self.last_mean_rewards > self.config.get('score_to_win', float('inf')):
                    print('Network won!')
                    self.save("./nn/" + self.config['name'] + 'ep=' + str(self.epoch_num) + 'rew=' + str(mean_rewards))
                    return self.last_mean_rewards, self.epoch_num

            if self.epoch_num > self.max_epochs:
                self.save("./nn/" + 'last_' + self.config['name'] + 'ep=' + str(self.epoch_num) + 'rew=' + str(mean_rewards))
                print('MAX EPOCHS NUM!')
                return self.last_mean_rewards, self.epoch_num

            update_time = 0
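# `self.game_rewards` / `self.game_lengths` above expose `current_size` and
# `get_mean()`. A minimal sketch of such a running meter, assuming a simple
# fixed-capacity average over recently finished episodes (hypothetical class,
# not necessarily the library's implementation):
class RunningMeter:
    def __init__(self, max_size=100):
        self.max_size = max_size
        self.values = []

    @property
    def current_size(self):
        return len(self.values)

    def update(self, value):
        # Keep only the statistics of the most recent `max_size` episodes.
        self.values.append(float(value))
        self.values = self.values[-self.max_size:]

    def get_mean(self):
        # Callers guard with `current_size > 0`, so division is safe there.
        return sum(self.values) / len(self.values)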
def train(self):
    self.init_tensors()
    self.last_mean_rewards = -100500
    start_time = time.time()
    total_time = 0
    rep_count = 0
    self.frame = 0
    self.obs = self.env_reset()
    self.curr_frames = self.batch_size_envs

    if self.multi_gpu:
        self.hvd.setup_algo(self)

    while True:
        epoch_num = self.update_epoch()
        play_time, update_time, sum_time, a_losses, c_losses, b_losses, entropies, kls, last_lr, lr_mul = self.train_epoch()
        total_time += sum_time
        frame = self.frame

        if self.multi_gpu:
            self.hvd.sync_stats(self)

        # Only rank 0 logs, prints, and saves checkpoints.
        if self.rank == 0:
            scaled_time = sum_time  # self.num_agents * sum_time
            scaled_play_time = play_time  # self.num_agents * play_time
            curr_frames = self.curr_frames
            self.frame += curr_frames

            if self.print_stats:
                fps_step = curr_frames / scaled_play_time
                fps_total = curr_frames / scaled_time
                print(f'fps step: {fps_step:.1f} fps total: {fps_total:.1f}')

            self.writer.add_scalar('performance/total_fps', curr_frames / scaled_time, frame)
            self.writer.add_scalar('performance/step_fps', curr_frames / scaled_play_time, frame)
            self.writer.add_scalar('performance/update_time', update_time, frame)
            self.writer.add_scalar('performance/play_time', play_time, frame)
            self.writer.add_scalar('losses/a_loss', torch_ext.mean_list(a_losses).item(), frame)
            self.writer.add_scalar('losses/c_loss', torch_ext.mean_list(c_losses).item(), frame)
            if len(b_losses) > 0:
                self.writer.add_scalar('losses/bounds_loss', torch_ext.mean_list(b_losses).item(), frame)
            self.writer.add_scalar('losses/entropy', torch_ext.mean_list(entropies).item(), frame)
            self.writer.add_scalar('info/last_lr', last_lr * lr_mul, frame)
            self.writer.add_scalar('info/lr_mul', lr_mul, frame)
            self.writer.add_scalar('info/e_clip', self.e_clip * lr_mul, frame)
            self.writer.add_scalar('info/kl', torch_ext.mean_list(kls).item(), frame)
            self.writer.add_scalar('info/epochs', epoch_num, frame)
            self.algo_observer.after_print_stats(frame, epoch_num, total_time)

            if self.game_rewards.current_size > 0:
                mean_rewards = self.game_rewards.get_mean()
                mean_lengths = self.game_lengths.get_mean()

                for i in range(self.value_size):
                    self.writer.add_scalar('rewards{0}/frame'.format(i), mean_rewards[i], frame)
                    self.writer.add_scalar('rewards{0}/iter'.format(i), mean_rewards[i], epoch_num)
                    self.writer.add_scalar('rewards{0}/time'.format(i), mean_rewards[i], total_time)

                self.writer.add_scalar('episode_lengths/frame', mean_lengths, frame)
                self.writer.add_scalar('episode_lengths/iter', mean_lengths, epoch_num)

                if self.has_self_play_config:
                    self.self_play_manager.update(self)

                # Periodic 'last_' checkpoint; an improving epoch is saved as best below instead.
                if self.save_freq > 0:
                    if (epoch_num % self.save_freq == 0) and (mean_rewards[0] <= self.last_mean_rewards):
                        self.save("./nn/" + 'last_' + self.config['name'] + 'ep=' + str(epoch_num) + 'rew=' + str(mean_rewards))

                if mean_rewards[0] > self.last_mean_rewards and epoch_num >= self.save_best_after:
                    print('saving next best rewards: ', mean_rewards)
                    self.last_mean_rewards = mean_rewards[0]
                    self.save("./nn/" + self.config['name'])
                    if self.last_mean_rewards > self.config['score_to_win']:
                        print('Network won!')
                        self.save("./nn/" + self.config['name'] + 'ep=' + str(epoch_num) + 'rew=' + str(mean_rewards))
                        return self.last_mean_rewards, epoch_num

            if epoch_num > self.max_epochs:
                self.save("./nn/" + 'last_' + self.config['name'] + 'ep=' + str(epoch_num) + 'rew=' + str(mean_rewards))
                print('MAX EPOCHS NUM!')
                return self.last_mean_rewards, epoch_num

            update_time = 0
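# `self.scheduler.update(...)` used by the epoch loops adapts the learning
# rate from the observed KL divergence. A minimal sketch of such an adaptive
# scheduler, assuming the common "shrink the LR when KL overshoots a threshold,
# grow it when KL undershoots" rule; class name and constants are illustrative,
# not the library's exact values:
class AdaptiveKLScheduler:
    def __init__(self, kl_threshold=0.008, min_lr=1e-6, max_lr=1e-2):
        self.kl_threshold = kl_threshold
        self.min_lr = min_lr
        self.max_lr = max_lr

    def update(self, current_lr, entropy_coef, epoch, frames, kl_dist):
        lr = current_lr
        if kl_dist > 2.0 * self.kl_threshold:
            lr = max(current_lr / 1.5, self.min_lr)   # policy moved too far
        elif kl_dist < 0.5 * self.kl_threshold:
            lr = min(current_lr * 1.5, self.max_lr)   # policy barely moved
        return lr, entropy_coef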
def train_epoch(self):
    play_time_start = time.time()
    # Collect rollouts without gradient tracking.
    with torch.no_grad():
        if self.is_rnn:
            batch_dict = self.play_steps_rnn()
        else:
            batch_dict = self.play_steps()
    play_time_end = time.time()

    update_time_start = time.time()
    rnn_masks = batch_dict.get('rnn_masks', None)
    self.curr_frames = batch_dict.pop('played_frames')
    self.prepare_dataset(batch_dict)
    self.algo_observer.after_steps()

    if self.has_central_value:
        self.train_central_value()

    a_losses = []
    c_losses = []
    b_losses = []
    entropies = []
    kls = []

    if self.is_rnn:
        frames_mask_ratio = rnn_masks.sum().item() / rnn_masks.nelement()
        print(frames_mask_ratio)

    for _ in range(0, self.mini_epochs_num):
        ep_kls = []
        for i in range(len(self.dataset)):
            a_loss, c_loss, entropy, kl, last_lr, lr_mul, cmu, csigma, b_loss = self.train_actor_critic(self.dataset[i])
            a_losses.append(a_loss)
            c_losses.append(c_loss)
            ep_kls.append(kl)
            entropies.append(entropy)
            if self.bounds_loss_coef is not None:
                b_losses.append(b_loss)

            self.dataset.update_mu_sigma(cmu, csigma)

            # 'legacy' schedule: update the LR after every minibatch.
            if self.schedule_type == 'legacy':
                if self.multi_gpu:
                    kl = self.hvd.average_value(kl, 'ep_kls')
                self.last_lr, self.entropy_coef = self.scheduler.update(self.last_lr, self.entropy_coef, self.epoch_num, 0, kl.item())
                self.update_lr(self.last_lr)

        av_kls = torch_ext.mean_list(ep_kls)

        # 'standard' schedule: update the LR once per mini-epoch.
        if self.schedule_type == 'standard':
            if self.multi_gpu:
                av_kls = self.hvd.average_value(av_kls, 'ep_kls')
            self.last_lr, self.entropy_coef = self.scheduler.update(self.last_lr, self.entropy_coef, self.epoch_num, 0, av_kls.item())
            self.update_lr(self.last_lr)
        kls.append(av_kls)

    # 'standard_epoch' schedule: update the LR once per training epoch.
    if self.schedule_type == 'standard_epoch':
        if self.multi_gpu:
            av_kls = self.hvd.average_value(torch_ext.mean_list(kls), 'ep_kls')
        self.last_lr, self.entropy_coef = self.scheduler.update(self.last_lr, self.entropy_coef, self.epoch_num, 0, av_kls.item())
        self.update_lr(self.last_lr)

    if self.has_phasic_policy_gradients:
        self.ppg_aux_loss.train_net(self)

    update_time_end = time.time()
    play_time = play_time_end - play_time_start
    update_time = update_time_end - update_time_start
    total_time = update_time_end - play_time_start

    return play_time, update_time, total_time, a_losses, c_losses, b_losses, entropies, kls, last_lr, lr_mul
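# `b_loss` above penalizes action-distribution means that drift outside the
# valid action range when `bounds_loss_coef` is set. A minimal sketch of such a
# bounds loss, assuming actions normalized to [-1, 1] with a small soft margin;
# the function name and margin are illustrative, not necessarily the exact loss
# computed by the agent:
import torch

def bound_loss(mu, soft_bound=1.1):
    # Quadratic penalty applied only to the part of mu beyond the soft bound;
    # mu values inside [-soft_bound, soft_bound] contribute zero.
    high = torch.clamp(mu - soft_bound, min=0.0) ** 2
    low = torch.clamp(mu + soft_bound, max=0.0) ** 2
    return (high + low).sum(dim=-1)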