assert_text = "ONE of --model or --demos-origin or --demos must be specified." assert int(args.model is None) + int(args.demos_origin is None) + int( args.demos is None) == 2, assert_text if args.seed is None: args.seed = 0 if args.model is not None else 1 start_time = time.time() logs = main(args, args.seed, args.episodes) end_time = time.time() # Print logs num_frames = sum(logs["num_frames_per_episode"]) fps = num_frames / (end_time - start_time) ellapsed_time = int(end_time - start_time) duration = datetime.timedelta(seconds=ellapsed_time) return_per_episode = utils.synthesize(logs["return_per_episode"]) success_per_episode = utils.synthesize( [1 if r > 0 else 0 for r in logs["return_per_episode"]]) num_frames_per_episode = utils.synthesize(logs["num_frames_per_episode"]) print( "F {} | FPS {:.0f} | D {} | R:xsmM {:.2f} {:.2f} {:.2f} {:.2f} | S {:.2f} | F:xsmM {:.1f} {:.1f} {} {}" .format(num_frames, fps, duration, *return_per_episode.values(), success_per_episode['mean'], *num_frames_per_episode.values())) indexes = sorted(range(len(logs["return_per_episode"])), key=lambda k: logs["return_per_episode"][k]) n = 10 print("{} worst episodes:".format(n)) for i in indexes[:n]: print("- episode {}: R={}, F={}".format(
def main():
    # Generate environments
    envs = []
    for i in range(args.procs):
        env = gym.make(args.env)
        env.seed(100 * args.seed + i)
        envs.append(env)

    # Define model name
    suffix = datetime.datetime.now().strftime("%y-%m-%d-%H-%M-%S")
    instr = args.instr_arch if args.instr_arch else "noinstr"
    mem = "mem" if not args.no_mem else "nomem"
    model_name_parts = {
        'env': args.env,
        'algo': args.algo,
        'arch': args.arch,
        'instr': instr,
        'mem': mem,
        'seed': args.seed,
        'info': '',
        'coef': '',
        'suffix': suffix}
    default_model_name = "{env}_{algo}_{arch}_{instr}_{mem}_seed{seed}{info}{coef}_{suffix}".format(
        **model_name_parts)
    if args.pretrained_model:
        default_model_name = args.pretrained_model + '_pretrained_' + default_model_name
    args.model = args.model.format(**model_name_parts) if args.model else default_model_name

    utils.configure_logging(args.model)
    logger = logging.getLogger(__name__)

    # Define obss preprocessor
    if 'emb' in args.arch:
        obss_preprocessor = utils.IntObssPreprocessor(args.model, envs[0].observation_space,
                                                      args.pretrained_model)
    else:
        # obss_preprocessor = utils.ObssPreprocessor(args.model, envs[0].observation_space,
        #                                            args.pretrained_model)
        obss_preprocessor = utils.ImgInstrObssPreprocessor(args.model, envs[0].observation_space)

    # Define actor-critic model
    acmodel = utils.load_model(args.model, raise_not_found=False)
    if acmodel is None:
        if args.pretrained_model:
            acmodel = utils.load_model(args.pretrained_model, raise_not_found=True)
        else:
            # acmodel = ACModel(obss_preprocessor.obs_space, envs[0].action_space,
            #                   args.image_dim, args.memory_dim, args.instr_dim,
            #                   not args.no_instr, args.instr_arch, not args.no_mem, args.arch)
            acmodel = ACModelImgInstr(obss_preprocessor.obs_space, envs[0].action_space,
                                      args.image_dim, args.memory_dim, args.instr_dim,
                                      not args.no_instr, not args.no_mem, args.arch)

    # obss_preprocessor.vocab.save()
    utils.save_model(acmodel, args.model)

    if torch.cuda.is_available():
        acmodel.cuda()

    # Define actor-critic algo
    reshape_reward = lambda _0, _1, reward, _2: args.reward_scale * reward
    if args.algo == "ppo":
        algo = babyai.rl.PPOAlgo(envs, acmodel, args.frames_per_proc, args.discount, args.lr,
                                 args.beta1, args.beta2, args.gae_lambda,
                                 args.entropy_coef, args.value_loss_coef, args.max_grad_norm,
                                 args.recurrence, args.optim_eps, args.clip_eps, args.ppo_epochs,
                                 args.batch_size, obss_preprocessor, reshape_reward)
    else:
        raise ValueError("Incorrect algorithm name: {}".format(args.algo))

    # When using extra binary information, more tensors (model parameters) are
    # initialized than when we don't, so the random state starts to differ. To make
    # sure that the results of supervised-loss-coef=0 and extra-binary-info=0 match,
    # we reseed here.
    utils.seed(args.seed)

    # Restore training status
    status_path = os.path.join(utils.get_log_dir(args.model), 'status.json')
    if os.path.exists(status_path):
        with open(status_path, 'r') as src:
            status = json.load(src)
    else:
        status = {'i': 0, 'num_episodes': 0, 'num_frames': 0}

    # Define logger and Tensorboard writer and CSV writer
    header = (["update", "episodes", "frames", "FPS", "duration"]
              + ["return_" + stat for stat in ['mean', 'std', 'min', 'max']]
              + ["success_rate"]
              + ["num_frames_" + stat for stat in ['mean', 'std', 'min', 'max']]
              + ["entropy", "value", "policy_loss", "value_loss", "loss", "grad_norm"])
    if args.tb:
        from tensorboardX import SummaryWriter
        writer = SummaryWriter(utils.get_log_dir(args.model))
    csv_path = os.path.join(utils.get_log_dir(args.model), 'log.csv')
    first_created = not os.path.exists(csv_path)
    # we don't buffer data going into the csv log, because we assume
    # that one update will take much longer than one write to the log
    csv_writer = csv.writer(open(csv_path, 'a', 1))
    if first_created:
        csv_writer.writerow(header)

    # Log code state, command, availability of CUDA and model
    babyai_code = list(babyai.__path__)[0]
    try:
        last_commit = subprocess.check_output(
            'cd {}; git log -n1'.format(babyai_code), shell=True).decode('utf-8')
        logger.info('LAST COMMIT INFO:')
        logger.info(last_commit)
    except subprocess.CalledProcessError:
        logger.info('Could not figure out the last commit')
    try:
        diff = subprocess.check_output(
            'cd {}; git diff'.format(babyai_code), shell=True).decode('utf-8')
        if diff:
            logger.info('GIT DIFF:')
            logger.info(diff)
    except subprocess.CalledProcessError:
        logger.info('Could not figure out the git diff')
    logger.info('COMMAND LINE ARGS:')
    logger.info(args)
    logger.info("CUDA available: {}".format(torch.cuda.is_available()))
    logger.info(acmodel)

    # Train model
    total_start_time = time.time()
    best_success_rate = 0
    best_mean_return = 0
    test_env_name = args.env
    while status['num_frames'] < args.frames:
        # Update parameters
        update_start_time = time.time()
        logs = algo.update_parameters()
        update_end_time = time.time()

        status['num_frames'] += logs["num_frames"]
        status['num_episodes'] += logs['episodes_done']
        status['i'] += 1

        # Print logs
        if status['i'] % args.log_interval == 0:
            total_elapsed_time = int(time.time() - total_start_time)
            fps = logs["num_frames"] / (update_end_time - update_start_time)
            duration = datetime.timedelta(seconds=total_elapsed_time)
            return_per_episode = utils.synthesize(logs["return_per_episode"])
            success_per_episode = utils.synthesize(
                [1 if r > 0 else 0 for r in logs["return_per_episode"]])
            num_frames_per_episode = utils.synthesize(logs["num_frames_per_episode"])

            data = [status['i'], status['num_episodes'], status['num_frames'],
                    fps, total_elapsed_time,
                    *return_per_episode.values(),
                    success_per_episode['mean'],
                    *num_frames_per_episode.values(),
                    logs["entropy"], logs["value"], logs["policy_loss"],
                    logs["value_loss"], logs["loss"], logs["grad_norm"]]

            format_str = ("U {} | E {} | F {:06} | FPS {:04.0f} | D {} | "
                          "R:xsmM {: .2f} {: .2f} {: .2f} {: .2f} | "
                          "S {:.2f} | F:xsmM {:.1f} {:.1f} {} {} | H {:.3f} | V {:.3f} | "
                          "pL {: .3f} | vL {:.3f} | L {:.3f} | gN {:.3f} | ")
            logger.info(format_str.format(*data))
            if args.tb:
                assert len(header) == len(data)
                for key, value in zip(header, data):
                    writer.add_scalar(key, float(value), status['num_frames'])
            csv_writer.writerow(data)

        # Save obss preprocessor vocabulary and model
        if args.save_interval > 0 and status['i'] % args.save_interval == 0:
            # obss_preprocessor.vocab.save()
            with open(status_path, 'w') as dst:
                json.dump(status, dst)
            utils.save_model(acmodel, args.model)

            # Testing the model before saving
            agent = ModelAgent(args.model, obss_preprocessor, argmax=True)
            agent.model = acmodel
            agent.model.eval()
            logs = batch_evaluate(agent, test_env_name, args.val_seed, args.val_episodes)
            agent.model.train()
            mean_return = np.mean(logs["return_per_episode"])
            success_rate = np.mean([1 if r > 0 else 0 for r in logs['return_per_episode']])
            save_model = False
            if success_rate > best_success_rate:
                best_success_rate = success_rate
                save_model = True
            elif (success_rate == best_success_rate) and (mean_return > best_mean_return):
                best_mean_return = mean_return
                save_model = True
            if save_model:
                utils.save_model(acmodel, args.model + '_best')
                # obss_preprocessor.vocab.save(utils.get_vocab_path(args.model + '_best'))
                logger.info("Return {: .2f}; best model is saved".format(mean_return))
            else:
                logger.info("Return {: .2f}; not the best model; not saved".format(mean_return))
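            # Hedged aside: assuming utils.load_model mirrors utils.save_model (the
            # pairing used earlier in this script), the '_best' checkpoint written
            # above could later be reloaded along these lines:
            #
            #   best_model = utils.load_model(args.model + '_best', raise_not_found=True)
            #   best_model.eval()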
update_start_time = time.time()
logs, theta = algo.update_parameters(fake_reward=0)
if args.fake_reward > 0:
    logs, _ = algo.update_parameters(fake_reward=args.fake_reward, cached_theta=theta)
update_end_time = time.time()

num_frames += logs["num_frames"]
# two updates were performed when a fake reward was used
i += (1 if args.fake_reward == 0 else 2)

# Print logs
if i % args.log_interval == 0:
    total_elapsed_time = int(time.time() - total_start_time)
    fps = logs["num_frames"] / (update_end_time - update_start_time)
    duration = datetime.timedelta(seconds=total_elapsed_time)
    return_per_episode = utils.synthesize(logs["return_per_episode"])
    rreturn_per_episode = utils.synthesize(logs["reshaped_return_per_episode"])
    num_frames_per_episode = utils.synthesize(logs["num_frames_per_episode"])

    logger.info(
        "U {} | F {:06} | FPS {:04.0f} | D {} | rR:x̄σmM {: .2f} {: .2f} {: .2f} {: .2f} | "
        "F:x̄σmM {:.1f} {:.1f} {} {} | H {:.3f} | V {:.3f} | pL {: .3f} | vL {:.3f}"
        .format(i, num_frames, fps, duration,
                *rreturn_per_episode.values(),
                *num_frames_per_episode.values(),
                logs["entropy"], logs["value"], logs["policy_loss"], logs["value_loss"]))

    if args.tb:
        writer.add_scalar("frames", num_frames, i)
        writer.add_scalar("FPS", fps, i)
        writer.add_scalar("duration", total_elapsed_time, i)
        for key, value in return_per_episode.items():
            writer.add_scalar("return_" + key, value, i)
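        # Hedged aside: the excerpt stops here; the analogous scalars for the frame
        # statistics presumably follow the same pattern, along these lines:
        #
        #   for key, value in num_frames_per_episode.items():
        #       writer.add_scalar("num_frames_" + key, value, i)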
def main(args, seed, episodes):
    # Set seed for all randomness sources
    utils.seed(seed)

    # Keep track of results per task.
    results = {}
    for env_name in args.env:
        start_time = time.time()
        env = gym.make(env_name)
        env.seed(seed)

        # Define agent
        agent = utils.load_agent(env, args.model, args.demos, args.demos_origin,
                                 args.argmax, env_name, model_path=args.model_path)

        if args.model is None and args.episodes > len(agent.demos):
            # Set the number of episodes to be the number of demos
            episodes = len(agent.demos)

        # Evaluate
        if isinstance(agent, utils.DemoAgent):
            logs = evaluate_demo_agent(agent, episodes)
        elif isinstance(agent, utils.BotAgent) or args.contiguous_episodes:
            logs = evaluate(agent, env, episodes, False)
        else:
            logs = batch_evaluate(agent, env_name, seed, episodes)
        end_time = time.time()

        # Print logs
        num_frames = sum(logs["num_frames_per_episode"])
        fps = num_frames / (end_time - start_time)
        elapsed_time = int(end_time - start_time)
        duration = datetime.timedelta(seconds=elapsed_time)

        if args.model is not None:
            return_per_episode = utils.synthesize(logs["return_per_episode"])
            success_per_episode = utils.synthesize(
                [1 if r > 0 else 0 for r in logs["return_per_episode"]])
        num_frames_per_episode = utils.synthesize(logs["num_frames_per_episode"])

        if args.model is not None:
            print("F {} | FPS {:.0f} | D {} | R:xsmM {:.3f} {:.3f} {:.3f} {:.3f} | S {:.3f} | F:xsmM {:.1f} {:.1f} {} {}"
                  .format(num_frames, fps, duration,
                          *return_per_episode.values(),
                          success_per_episode['mean'],
                          *num_frames_per_episode.values()))
        else:
            print("F {} | FPS {:.0f} | D {} | F:xsmM {:.1f} {:.1f} {} {}"
                  .format(num_frames, fps, duration, *num_frames_per_episode.values()))

        # Episodes with the most frames come first
        indexes = sorted(range(len(logs["num_frames_per_episode"])),
                         key=lambda k: -logs["num_frames_per_episode"][k])

        n = args.worst_episodes_to_show
        if n > 0:
            print("{} worst episodes:".format(n))
            for i in indexes[:n]:
                if 'seed_per_episode' in logs:
                    print(logs['seed_per_episode'][i])
                if args.model is not None:
                    print("- episode {}: R={}, F={}".format(
                        i, logs["return_per_episode"][i], logs["num_frames_per_episode"][i]))
                else:
                    print("- episode {}: F={}".format(i, logs["num_frames_per_episode"][i]))

        # Store results for this env. The return and success statistics are
        # only defined when a model was evaluated.
        if args.model is not None:
            logs['return_per_episode'] = return_per_episode
            logs['success_per_episode'] = success_per_episode
        logs['num_frames_per_episode'] = num_frames_per_episode
        results[env_name] = logs

    return results
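# Hedged aside: a sketch of how the returned results could be consumed (the
# 'success_per_episode' entry is only stored when a model was evaluated):
#
#   results = main(args, args.seed, args.episodes)
#   for env_name, logs in results.items():
#       print(env_name, logs['success_per_episode']['mean'])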
logs = algo.update_parameters()
update_end_time = time.time()

status['num_frames'] += logs["num_frames"]
status['num_episodes'] += logs['episodes_done']
status['i'] += 1

# Print logs
if status['i'] % args.log_interval == 0:
    print(args.description)
    total_elapsed_time = int(time.time() - total_start_time)
    fps = logs["num_frames"] / (update_end_time - update_start_time)
    duration = datetime.timedelta(seconds=total_elapsed_time)
    return_per_episode = utils.synthesize(logs["return_per_episode"])
    success_per_episode = utils.synthesize([
        1 if r > 2 else 0 for r in logs["return_per_episode"]
    ])  # TODO: change based on dense rewards or not
    num_frames_per_episode = utils.synthesize(logs["num_frames_per_episode"])

    og_data = [
        status['i'], status['num_episodes'], status['num_frames'],
        fps, total_elapsed_time,
        *return_per_episode.values(),
        success_per_episode['mean'],
        *num_frames_per_episode.values(),
        logs["entropy"], logs["value"], logs["policy_loss"],
        logs["value_loss"], logs["loss"], logs["grad_norm"]
    ]
    data = [