        args.model, envs[0].observation_space, args.pretrained_model)
else:
    obss_preprocessor = utils.ObssPreprocessor(
        args.model, envs[0].observation_space, args.pretrained_model)

# Define actor-critic model: reuse a saved model if one exists,
# otherwise create a new one
acmodel = utils.load_model(args.model, raise_not_found=False)
if acmodel is None:
    acmodel = ACModelGNN(obss_preprocessor.obs_space, envs[0].action_space,
                         args.image_dim, args.memory_dim, args.instr_dim,
                         not args.no_instr, args.instr_arch, not args.no_mem,
                         args.arch)

obss_preprocessor.vocab.save()
utils.save_model(acmodel, args.model)

# if torch.cuda.is_available():
#     acmodel.cuda()

# Define actor-critic algorithm
reshape_reward = lambda _0, _1, reward, _2: args.reward_scale * reward
if args.algo == "ppo":
    algo = babyai.rl.PPOAlgoGNN(envs, acmodel, args.frames_per_proc, args.discount,
                                args.lr, args.beta1, args.beta2, args.gae_lambda,
                                args.entropy_coef, args.value_loss_coef,
                                args.max_grad_norm, args.recurrence, args.optim_eps,
                                args.clip_eps, args.ppo_epochs, args.batch_size,
                                obss_preprocessor, reshape_reward)
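# A hedged, self-contained sketch of the reshape_reward hook above. The
# assumed call signature is (obs, action, reward, done) -> float, matching
# the four positional arguments of the lambda; _DemoArgs stands in for the
# parsed command-line args and is illustrative only.
class _DemoArgs:
    reward_scale = 20.0

_demo_args = _DemoArgs()
_reshape_reward = lambda _0, _1, reward, _2: _demo_args.reward_scale * reward
assert _reshape_reward(None, None, 0.5, False) == 10.0  # 20.0 * 0.5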
def __init__(self, args):
    self.args = args
    utils.seed(self.args.seed)

    # args.multi_env is a list when training on multiple environments
    if getattr(args, 'multi_env', None):
        self.env = [gym.make(item) for item in args.multi_env]

        self.train_demos = []
        for demos, episodes in zip(args.multi_demos, args.multi_episodes):
            demos_path = utils.get_demos_path(demos, None, None, valid=False)
            logger.info('loading {} of {} demos'.format(episodes, demos))
            train_demos = utils.load_demos(demos_path)
            logger.info('loaded demos')
            if episodes > len(train_demos):
                raise ValueError("there are only {} train demos in {}".format(
                    len(train_demos), demos))
            self.train_demos.extend(train_demos[:episodes])
            logger.info('So far, {} demos loaded'.format(len(self.train_demos)))

        self.val_demos = []
        for demos, episodes in zip(args.multi_demos,
                                   [args.val_episodes] * len(args.multi_demos)):
            demos_path_valid = utils.get_demos_path(demos, None, None, valid=True)
            logger.info('loading {} of {} valid demos'.format(episodes, demos))
            valid_demos = utils.load_demos(demos_path_valid)
            logger.info('loaded demos')
            if episodes > len(valid_demos):
                logger.info('Using all the available {} demos to evaluate '
                            'valid. accuracy'.format(len(valid_demos)))
            self.val_demos.extend(valid_demos[:episodes])
            logger.info('So far, {} valid demos loaded'.format(len(self.val_demos)))

        logger.info('Loaded all demos')

        observation_space = self.env[0].observation_space
        action_space = self.env[0].action_space
    else:
        self.env = gym.make(self.args.env)

        demos_path = utils.get_demos_path(args.demos, args.env,
                                          args.demos_origin, valid=False)
        demos_path_valid = utils.get_demos_path(args.demos, args.env,
                                                args.demos_origin, valid=True)

        logger.info('loading demos')
        self.train_demos = utils.load_demos(demos_path)
        logger.info('loaded demos')
        if args.episodes:
            if args.episodes > len(self.train_demos):
                raise ValueError("there are only {} train demos".format(
                    len(self.train_demos)))
            self.train_demos = self.train_demos[:args.episodes]

        self.val_demos = utils.load_demos(demos_path_valid)
        if args.val_episodes > len(self.val_demos):
            logger.info('Using all the available {} demos to evaluate '
                        'valid. accuracy'.format(len(self.val_demos)))
        self.val_demos = self.val_demos[:self.args.val_episodes]

        observation_space = self.env.observation_space
        action_space = self.env.action_space

    self.obss_preprocessor = utils.ObssPreprocessor(
        args.model, observation_space,
        getattr(self.args, 'pretrained_model', None))

    # Define actor-critic model
    self.acmodel = utils.load_model(args.model, raise_not_found=False)
    if self.acmodel is None:
        if getattr(self.args, 'pretrained_model', None):
            logger.info("Loading pretrained model")
            self.acmodel = utils.load_model(args.pretrained_model,
                                            raise_not_found=True)
        else:
            logger.info('Creating new model')
            self.acmodel = ACModel(
                self.obss_preprocessor.obs_space, action_space,
                args.image_dim, args.memory_dim, args.instr_dim,
                not self.args.no_desc, self.args.instr_arch,
                not self.args.no_mem, self.args.arch,
                random_shuffled=self.args.random_shuffle,
                instr_sents=self.env.n_floor_colors,
                enable_instr=self.args.enable_instr,
                instr_only=self.args.instr_only)

    self.obss_preprocessor.vocab.save()
    utils.save_model(self.acmodel, args.model)

    self.acmodel.train()
    if torch.cuda.is_available():
        self.acmodel.cuda()

    self.optimizer = torch.optim.Adam(self.acmodel.parameters(),
                                      self.args.lr, eps=self.args.optim_eps)
    self.scheduler = torch.optim.lr_scheduler.StepLR(self.optimizer,
                                                     step_size=100, gamma=0.9)

    self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
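# A minimal, isolated sketch of the optimizer/scheduler pairing above
# (assumes only torch): StepLR(step_size=100, gamma=0.9) multiplies the
# learning rate by 0.9 once every 100 scheduler.step() calls, so the
# illustrative 1e-4 below decays to ~9e-5 after the 100th epoch.
import torch

_net = torch.nn.Linear(4, 2)
_opt = torch.optim.Adam(_net.parameters(), lr=1e-4, eps=1e-5)
_sched = torch.optim.lr_scheduler.StepLR(_opt, step_size=100, gamma=0.9)
for _ in range(100):
    _opt.step()    # one optimization step per epoch in this sketch
    _sched.step()
print(_opt.param_groups[0]['lr'])  # -> 9e-05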
"U {} | F {:06} | FPS {:04.0f} | D {} | rR:x̄σmM {: .2f} {: .2f} {: .2f} {: .2f} | F:x̄σmM {:.1f} {:.1f} {} {} | H {:.3f} | V {:.3f} | pL {: .3f} | vL {:.3f}" .format(i, num_frames, fps, duration, *rreturn_per_episode.values(), *num_frames_per_episode.values(), logs["entropy"], logs["value"], logs["policy_loss"], logs["value_loss"])) if args.tb: writer.add_scalar("frames", num_frames, i) writer.add_scalar("FPS", fps, i) writer.add_scalar("duration", total_ellapsed_time, i) for key, value in return_per_episode.items(): writer.add_scalar("return_" + key, value, i) for key, value in rreturn_per_episode.items(): writer.add_scalar("rreturn_" + key, value, i) for key, value in num_frames_per_episode.items(): writer.add_scalar("num_frames_" + key, value, i) writer.add_scalar("entropy", logs["entropy"], i) writer.add_scalar("value", logs["value"], i) writer.add_scalar("policy_loss", logs["policy_loss"], i) writer.add_scalar("value_loss", logs["value_loss"], i) # Save obss preprocessor vocabulary and model if args.save_interval > 0 and i % args.save_interval == 0: obss_preprocessor.vocab.save() if torch.cuda.is_available(): acmodel.cpu() utils.save_model(acmodel, model_name) logger.info("Model is saved.") if torch.cuda.is_available(): acmodel.cuda()
advice_end_index = advice_start_index + env.action_space.n + 1
acmodel = ACModel(obss_preprocessor.obs_space, envs[0].action_space, envs[0],
                  args.image_dim, args.memory_dim, args.instr_dim,
                  not args.no_instr, args.instr_arch, not args.no_mem,
                  advice_dim=128,
                  advice_start_index=advice_start_index,
                  advice_end_index=advice_end_index)

obss_preprocessor.vocab.save()
utils.save_model(acmodel, args.model)

if torch.cuda.is_available():
    acmodel.cuda()

# Define actor-critic algorithm
reshape_reward = lambda _0, _1, reward, _2: args.reward_scale * reward
if args.algo == "ppo":
    algo = PPOAlgo(acmodel, envs, args.frames_per_proc, args.discount, args.lr,
                   args.beta1, args.beta2, args.gae_lambda, args.entropy_coef,
                   args.value_loss_coef, args.max_grad_norm, args.recurrence,
                   args.optim_eps, args.clip_eps, args.ppo_epochs, args.batch_size)
else:
    raise ValueError("Incorrect algorithm name: {}".format(args.algo))
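# A hedged reading of the index arithmetic above: the advice tokens appear to
# occupy a contiguous vocabulary slice of size action_space.n + 1 (one token
# per action plus one extra, e.g. a "no advice" token). The numbers below are
# illustrative, not taken from the training configuration.
n_actions = 7                 # e.g. the BabyAI discrete action space
advice_start = 100            # hypothetical vocabulary offset
advice_end = advice_start + n_actions + 1
advice_token_ids = list(range(advice_start, advice_end))  # 8 contiguous ids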
    if model is None:
        if pretrained[m]:
            models.append(utils.load_model(pretrained[m], raise_not_found=True))
        else:
            models.append(ACModel(obss_preprocessor.obs_spaces[m],
                                  envs[0].action_space,
                                  args.image_dim, args.memory_dim, args.instr_dim,
                                  args.enc_dim, args.dec_dim,
                                  args.len_message, args.num_symbols))
    else:
        models.append(model)

for m, model in enumerate(models):
    obss_preprocessor.vocabs[m].save()
    utils.save_model(model, model_names[m])

if torch.cuda.is_available():
    for model in models:
        model.cuda()

# Define actor-critic algorithm
reshape_reward = lambda _0, _1, reward, _2: args.reward_scale * reward
algo = PPOAlgo(penv, models, args.frames_per_proc, args.discount, args.lr,
               args.beta1, args.beta2, args.gae_lambda, args.entropy_coef,
               args.value_loss_coef, args.max_grad_norm, args.recurrence,
               args.optim_eps, args.clip_eps, args.ppo_epochs, args.batch_size,
               obss_preprocessor, reshape_reward, not args.no_comm,
               args.conventional)

for m, model_name in enumerate(model_names):
def train(self, train_demos, writer, csv_writer, status_path, header,
          reset_status=False):
    # Load the training status
    def initial_status():
        return {'i': 0, 'num_frames': 0, 'patience': 0}

    status = initial_status()
    if os.path.exists(status_path) and not reset_status:
        with open(status_path, 'r') as src:
            status = json.load(src)
    elif not os.path.exists(os.path.dirname(status_path)):
        # Ensure that the status directory exists
        os.makedirs(os.path.dirname(status_path))

    # If the batch size is larger than the number of demos, lower the batch size
    if self.args.batch_size > len(train_demos):
        self.args.batch_size = len(train_demos)
        logger.info("Batch size too high. Setting it to the number of "
                    "train demos ({})".format(len(train_demos)))

    # Save the model initially to avoid a "model not found" exception
    # during the first validation step
    utils.save_model(self.acmodel, self.args.model)

    # Best success rate so far, used to track performance on the validation set
    best_success_rate, patience, i = 0, 0, 0
    total_start_time = time.time()

    while status['i'] < getattr(self.args, 'epochs', int(1e9)):
        if 'patience' not in status:  # e.g. when fine-tuning an RL-pretrained agent with IL
            status['patience'] = 0
        # Do not learn if using a pre-trained model that already lost patience
        if status['patience'] > self.args.patience:
            break
        if status['num_frames'] > self.args.frames:
            break

        status['i'] += 1
        i = status['i']
        update_start_time = time.time()

        # Learning rate scheduler
        self.scheduler.step()

        log = self.run_epoch_recurrence(train_demos, is_training=True)
        total_len = sum([len(item[3]) for item in train_demos])
        status['num_frames'] += total_len

        update_end_time = time.time()

        # Print logs
        if status['i'] % self.args.log_interval == 0:
            total_ellapsed_time = int(time.time() - total_start_time)
            fps = total_len / (update_end_time - update_start_time)
            duration = datetime.timedelta(seconds=total_ellapsed_time)

            for key in log:
                log[key] = np.mean(log[key])

            train_data = [status['i'], status['num_frames'], fps,
                          total_ellapsed_time, log["entropy"],
                          log["policy_loss"], log["accuracy"]]

            logger.info("U {} | F {:06} | FPS {:04.0f} | D {} | H {:.3f} "
                        "| pL {: .3f} | A {: .3f}".format(*train_data))

            # Log the gathered data only when we don't evaluate the validation
            # metrics; they will be logged afterwards anyway when
            # status['i'] % self.args.val_interval == 0
            if status['i'] % self.args.val_interval != 0:
                # Instantiate validation_data with empty strings when no
                # validation is done
                validation_data = [''] * len([key for key in header
                                              if 'valid' in key])
                assert len(header) == len(train_data + validation_data)
                if self.args.tb:
                    for key, value in zip(header, train_data):
                        writer.add_scalar(key, float(value), status['num_frames'])
                csv_writer.writerow(train_data + validation_data)

        if status['i'] % self.args.val_interval == 0:
            valid_log = self.validate(self.args.val_episodes)
            mean_return = [np.mean(log['return_per_episode'])
                           for log in valid_log]
            success_rate = [np.mean([1 if r > 0 else 0
                                     for r in log['return_per_episode']])
                            for log in valid_log]

            val_log = self.run_epoch_recurrence(self.val_demos)
            validation_accuracy = np.mean(val_log["accuracy"])

            if status['i'] % self.args.log_interval == 0:
                validation_data = [validation_accuracy] + mean_return + success_rate
                logger.info(("Validation: A {: .3f} "
                             + "| R {: .3f} " * len(mean_return)
                             + "| S {: .3f} " * len(success_rate)
                             ).format(*validation_data))

                assert len(header) == len(train_data + validation_data)
                if self.args.tb:
                    for key, value in zip(header, train_data + validation_data):
                        writer.add_scalar(key, float(value), status['num_frames'])
                csv_writer.writerow(train_data + validation_data)

            # In the multi-env case, the update condition would be a
            # better *mean* success rate!
            if np.mean(success_rate) > best_success_rate:
                best_success_rate = np.mean(success_rate)
                status['patience'] = 0
                with open(status_path, 'w') as dst:
                    json.dump(status, dst)
                # Save the best model
                logger.info("Saving best model")
                if torch.cuda.is_available():
                    self.acmodel.cpu()
                utils.save_model(self.acmodel, self.args.model + "_best")
                self.obss_preprocessor.vocab.save(
                    utils.get_vocab_path(self.args.model + "_best"))
                if torch.cuda.is_available():
                    self.acmodel.cuda()
            else:
                status['patience'] += 1
                logger.info("Losing patience, new value={}, limit={}".format(
                    status['patience'], self.args.patience))

            if torch.cuda.is_available():
                self.acmodel.cpu()
            utils.save_model(self.acmodel, self.args.model)
            self.obss_preprocessor.vocab.save()
            if torch.cuda.is_available():
                self.acmodel.cuda()
            with open(status_path, 'w') as dst:
                json.dump(status, dst)
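# The status file written above is what makes training resumable. A hedged,
# standalone sketch of the same load-or-initialize logic (field names mirror
# the code above; the path in the usage comment is illustrative):
import json
import os

def load_status(status_path, reset_status=False):
    status = {'i': 0, 'num_frames': 0, 'patience': 0}
    if os.path.exists(status_path) and not reset_status:
        with open(status_path, 'r') as src:
            status = json.load(src)
    # Older checkpoints (e.g. from an RL-pretrained agent) may lack 'patience'
    status.setdefault('patience', 0)
    return status

# load_status('models/MyLevel/status.json')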
acmodel0 = ACModel(obss_preprocessor.obs_space, envs0[0].action_space,
                   args.image_dim, args.memory_dim, args.instr_dim,
                   args.enc_dim, args.dec_dim,
                   not args.no_instr, args.instr_arch, not args.no_mem,
                   args.arch, args.len_message, args.num_symbols)

if acmodel1 is None:
    if args.pretrained_model:
        acmodel1 = utils.load_model(args.pretrained_model, 1,
                                    raise_not_found=True)
    else:
        # torch.manual_seed(args.seed)
        acmodel1 = ACModel(obss_preprocessor.obs_space, envs1[0].action_space,
                           args.image_dim, args.memory_dim, args.instr_dim,
                           args.enc_dim, args.dec_dim,
                           not args.no_instr, args.instr_arch, not args.no_mem,
                           args.arch, args.len_message, args.num_symbols)

obss_preprocessor.vocab.save()
utils.save_model(acmodel0, args.model, 0)
utils.save_model(acmodel1, args.model, 1)

if torch.cuda.is_available():
    acmodel0.cuda()
    acmodel1.cuda()

# Define actor-critic algorithm
reshape_reward = lambda _0, _1, reward, _2: args.reward_scale * reward
if args.algo == "ppo":
    algo = babyai.rl.PPOAlgo(envs0, envs1, acmodel0, acmodel1,
                             args.frames_per_proc, args.discount, args.lr,
                             args.beta1, args.beta2, args.gae_lambda,
                             args.entropy_coef, args.value_loss_coef,
                             args.max_grad_norm, args.recurrence,
                             args.optim_eps, args.clip_eps, args.ppo_epochs,
                             args.batch_size, obss_preprocessor, reshape_reward)