def main(args, config):
    set_random_seed(args)

    # Environment
    env = get_environment(args, config)

    # Policy & Baseline
    policy = get_policy_for_env(args, env,
                                hidden_sizes=config['hidden-sizes'],
                                nonlinearity=config['nonlinearity'])
    policy.share_memory()  # share memory across processes for multiprocessing
    baseline = LinearFeatureBaseline(reduce(mul, env.observation_space.shape, 1))

    # Sampler
    sampler = MultiTaskSampler(config['env-name'],
                               env_kwargs=config.get('env-kwargs', {}),
                               batch_size=config['fast-batch-size'],
                               policy=policy,
                               baseline=baseline,
                               env=env,
                               seed=args.seed,
                               num_workers=args.num_workers)

    # Meta model
    metalearner = MAMLTRPO(policy,
                           fast_lr=config['fast-lr'],
                           first_order=config['first-order'],
                           device=args.device)

    # Solver
    solver = Solver(args, config, policy, sampler, metalearner)
    solver.train(args, config)
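# `Solver` is not defined in this section. The sketch below shows one way such a
# wrapper could drive the same sample/step loop that the other variants implement
# inline; it is an assumed interface, not the original class.
from tqdm import trange


class Solver(object):
    def __init__(self, args, config, policy, sampler, metalearner):
        self.args = args
        self.config = config
        self.policy = policy
        self.sampler = sampler
        self.metalearner = metalearner

    def train(self, args, config):
        for batch in trange(config['num-batches']):
            # Sample trajectories for a batch of tasks asynchronously, ...
            tasks = self.sampler.sample_tasks(num_tasks=config['meta-batch-size'])
            futures = self.sampler.sample_async(tasks,
                                                num_steps=config['num-steps'],
                                                fast_lr=config['fast-lr'],
                                                gamma=config['gamma'],
                                                gae_lambda=config['gae-lambda'],
                                                device=args.device)
            # ...take one TRPO meta-update on the outer-loop objective, ...
            self.metalearner.step(*futures,
                                  max_kl=config['max-kl'],
                                  cg_iters=config['cg-iters'],
                                  cg_damping=config['cg-damping'],
                                  ls_max_steps=config['ls-max-steps'],
                                  ls_backtrack_ratio=config['ls-backtrack-ratio'])
            # ...and block until the sampled episodes are available.
            train_episodes, valid_episodes = self.sampler.sample_wait(futures)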
def main(args):
    with open(args.config, 'r') as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

    if args.output_folder is not None:
        if not os.path.exists(args.output_folder):
            os.makedirs(args.output_folder)
        config_filename = os.path.join(args.output_folder, str(args.seed) + '_config.json')
        policy_filename = os.path.join(args.output_folder, str(args.seed) + '_policy.th')
        result_filename_txt = os.path.join(args.output_folder, str(args.seed) + '_results.txt')
        result_filename_pickle = os.path.join(args.output_folder,
                                              str(args.seed) + '_results.pickle')

        with open(config_filename, 'w') as f:
            config.update(vars(args))
            json.dump(config, f, indent=2)

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    results = {
        'train_costs': [],
        'test_costs': [],
        'train_costs_sum': [],  # the cost
        'test_costs_sum': [],
        'train_costs_mean': [],
        'test_costs_mean': [],  # the evaluation for grid-world, key-door, and mountain-car problems
        'train_returns': [],
        'test_returns': [],
        'train_returns_mean': [],
        'test_returns_mean': [],  # the evaluation for the treasure problem
        'train_returns_std': [],
        'test_returns_std': [],
    }

    # env = gym.make(config['env-name'], **config['env-kwargs'])
    env = gym.make(config['env-name'])
    env.close()

    # Policy
    policy = get_policy_for_env(env,
                                hidden_sizes=config['hidden-sizes'],
                                nonlinearity=config['nonlinearity'])
    policy.share_memory()

    # Baseline
    baseline = LinearFeatureBaseline(get_input_size(env))

    # Sampler
    sampler = MultiTaskSampler(config['env-name'],
                               env_kwargs=config['env-kwargs'],
                               batch_size=config['fast-batch-size'],
                               policy=policy,
                               baseline=baseline,
                               env=env,
                               seed=args.seed,
                               num_workers=args.num_workers)

    metalearner = MAMLTRPO(policy,
                           fast_lr=config['fast-lr'],
                           first_order=config['first-order'],
                           device=args.device)

    num_iterations = 0
    for batch in trange(config['num-batches']):
        tasks = sampler.sample_tasks(num_tasks=config['meta-batch-size'])
        futures = sampler.sample_async(tasks,
                                       num_steps=config['num-steps'],
                                       fast_lr=config['fast-lr'],
                                       gamma=config['gamma'],
                                       gae_lambda=config['gae-lambda'],
                                       device=args.device)
        logs = metalearner.step(*futures,
                                max_kl=config['max-kl'],
                                cg_iters=config['cg-iters'],
                                cg_damping=config['cg-damping'],
                                ls_max_steps=config['ls-max-steps'],
                                ls_backtrack_ratio=config['ls-backtrack-ratio'])
        train_episodes, valid_episodes = sampler.sample_wait(futures)

        num_iterations += sum(sum(episode.lengths)
                              for episode in train_episodes[0])
        num_iterations += sum(sum(episode.lengths)
                              for episode in valid_episodes)
        logs.update(tasks=tasks,
                    num_iterations=num_iterations,
                    train_returns=get_returns(train_episodes[0]),
                    valid_returns=get_returns(valid_episodes))

        train_returns = get_discounted_returns(train_episodes[0])
        test_returns = get_discounted_returns(valid_episodes)
        train_costs = get_costs(train_episodes[0])
        test_costs = get_costs(valid_episodes)

        # Save results
        results['train_returns'].append(train_returns)
        results['test_returns'].append(test_returns)
        results['train_returns_mean'].append(np.mean(train_returns))
        results['test_returns_mean'].append(np.mean(test_returns))
        results['train_returns_std'].append(np.std(train_returns))
        results['test_returns_std'].append(np.std(test_returns))
        results['train_costs'].append(train_costs)
        results['test_costs'].append(test_costs)
        results['train_costs_sum'].append(np.sum(train_costs))
        results['test_costs_sum'].append(np.sum(test_costs))
        results['train_costs_mean'].append(np.mean(train_costs))
        results['test_costs_mean'].append(np.mean(test_costs))

        with open(result_filename_txt, "w") as file:
            file.write(str(results))
        with open(result_filename_pickle, "wb") as file:
            dump(results, file, protocol=2)  # pickle.dump

        # Save policy
        if args.output_folder is not None:
            with open(policy_filename, 'wb') as f:
                torch.save(policy.state_dict(), f)

    print(np.sum(results['train_costs_sum']))
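# The loop above calls `get_discounted_returns` and `get_costs`, whose definitions
# are not included in this section. The sketch below is one possible implementation,
# assuming the episode batches expose a `rewards` tensor of shape
# (horizon, batch_size), as `get_returns` does in pytorch-maml-rl; treat it as
# illustrative, not as the original helpers.
import numpy as np
import torch


def get_discounted_returns(episodes_list, gamma=0.99):
    """Discounted return per sampled episode, one row per task in the batch."""
    returns = []
    for episodes in episodes_list:
        rewards = episodes.rewards  # (horizon, batch_size)
        discounts = gamma ** torch.arange(rewards.shape[0],
                                          dtype=rewards.dtype).unsqueeze(1)
        returns.append((discounts * rewards).sum(dim=0).cpu().numpy())
    return np.stack(returns, axis=0)


def get_costs(episodes_list):
    """Undiscounted cost per episode, assuming rewards are negated costs."""
    return np.stack([(-episodes.rewards).sum(dim=0).cpu().numpy()
                     for episodes in episodes_list], axis=0)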
def main(args):
    with open(args.config, 'r') as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

    if args.output_folder is not None:
        if not os.path.exists(args.output_folder):
            os.makedirs(args.output_folder)
        policy_filename = os.path.join(args.output_folder, 'policy.th')
        config_filename = os.path.join(args.output_folder, 'config.json')

        with open(config_filename, 'w') as f:
            config.update(vars(args))
            json.dump(config, f, indent=2)

    if args.seed is not None:
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

    print(config)
    writer = SummaryWriter(logdir='./log')

    env = gym.make(config['env-name'], **config.get('env-kwargs', {}))
    env.close()

    # Policy
    policy = get_policy_for_env(env,
                                hidden_sizes=config['hidden-sizes'],
                                nonlinearity=config['nonlinearity'])
    policy.share_memory()

    # Baseline
    baseline = LinearFeatureBaseline(get_input_size(env))

    # Sampler
    sampler = MultiTaskSampler(config['env-name'],
                               env_kwargs=config.get('env-kwargs', {}),
                               batch_size=config['fast-batch-size'],
                               policy=policy,
                               baseline=baseline,
                               env=env,
                               seed=args.seed,
                               num_workers=args.num_workers)

    metalearner = MAMLTRPO(policy,
                           fast_lr=config['fast-lr'],
                           first_order=config['first-order'],
                           device=args.device)

    num_iterations = 0
    for batch in trange(config['num-batches']):
        tasks = sampler.sample_tasks(num_tasks=config['meta-batch-size'])  # (meta-batch-size, K-arm)
        futures = sampler.sample_async(tasks,
                                       num_steps=config['num-steps'],
                                       fast_lr=config['fast-lr'],
                                       gamma=config['gamma'],
                                       gae_lambda=config['gae-lambda'],
                                       device=args.device)
        logs = metalearner.step(*futures,
                                max_kl=config['max-kl'],
                                cg_iters=config['cg-iters'],
                                cg_damping=config['cg-damping'],
                                ls_max_steps=config['ls-max-steps'],
                                ls_backtrack_ratio=config['ls-backtrack-ratio'])
        train_episodes, valid_episodes = sampler.sample_wait(futures)

        num_iterations += sum(sum(episode.lengths)
                              for episode in train_episodes[0])
        num_iterations += sum(sum(episode.lengths)
                              for episode in valid_episodes)
        logs.update(tasks=tasks,
                    num_iterations=num_iterations,
                    train_returns=get_returns(train_episodes[0]),
                    valid_returns=get_returns(valid_episodes))

        # TensorBoard logging
        writer.add_scalar('MAML/Loss Before', logs['loss_before'].mean(), num_iterations)
        writer.add_scalar('MAML/KL Before', logs['kl_before'].mean(), num_iterations)
        if 'loss_after' in logs:
            writer.add_scalar('MAML/Loss After', logs['loss_after'].mean(), num_iterations)
        if 'kl_after' in logs:
            writer.add_scalar('MAML/KL After', logs['kl_after'].mean(), num_iterations)
        writer.add_scalar('MAML/Train Returns', logs['train_returns'].sum(), num_iterations)
        writer.add_scalar('MAML/Valid Returns', logs['valid_returns'].sum(), num_iterations)
        writer.add_scalar('MAML/Train Cumulative Regret',
                          sum([task['mean'].max() for task in logs['tasks']])
                          * config['fast-batch-size'] - logs['train_returns'].sum(),
                          num_iterations)
        writer.add_scalar('MAML/Valid Cumulative Regret',
                          sum([task['mean'].max() for task in logs['tasks']])
                          * config['fast-batch-size'] - logs['valid_returns'].sum(),
                          num_iterations)

        # Save policy
        if args.output_folder is not None:
            with open(policy_filename, 'wb') as f:
                torch.save(policy.state_dict(), f)

    writer.close()
def main(args):
    with open(args.config, 'r') as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

    if args.output_folder is not None:
        if not os.path.exists(args.output_folder):
            os.makedirs(args.output_folder)
        policy_filename = os.path.join(args.output_folder, 'policy.th')
        config_filename = os.path.join(args.output_folder, 'config.json')

        with open(config_filename, 'w') as f:
            config.update(vars(args))
            json.dump(config, f, indent=2)

    if args.seed is not None:
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

    env = gym.make(config['env-name'], **config['env-kwargs'])
    env.close()

    # Policy
    policy = get_policy_for_env(env,
                                hidden_sizes=config['hidden-sizes'],
                                nonlinearity=config['nonlinearity'])
    policy.share_memory()
    print(policy)

    # Baseline
    baseline = LinearFeatureBaseline(get_input_size(env))

    # Sampler
    sampler = MultiTaskSampler(config['env-name'],
                               env_kwargs=config['env-kwargs'],
                               batch_size=config['fast-batch-size'],
                               policy=policy,
                               baseline=baseline,
                               env=env,
                               seed=args.seed,
                               num_workers=args.num_workers,
                               args=args)

    metalearner = MAMLTRPO(policy,
                           fast_lr=config['fast-lr'],
                           first_order=config['first-order'],
                           device=args.device)

    num_iterations = 0
    for batch in trange(config['num-batches']):
        tasks = sampler.sample_tasks(num_tasks=config['meta-batch-size'])
        futures = sampler.sample_async(tasks,
                                       num_steps=config['num-steps'],
                                       fast_lr=config['fast-lr'],
                                       gamma=config['gamma'],
                                       gae_lambda=config['gae-lambda'],
                                       device=args.device)
        logs = metalearner.step(*futures,
                                max_kl=config['max-kl'],
                                cg_iters=config['cg-iters'],
                                cg_damping=config['cg-damping'],
                                ls_max_steps=config['ls-max-steps'],
                                ls_backtrack_ratio=config['ls-backtrack-ratio'],
                                args=args)
        train_episodes, valid_episodes = sampler.sample_wait(futures)

        num_iterations += sum(sum(episode.lengths)
                              for episode in train_episodes[0])
        num_iterations += sum(sum(episode.lengths)
                              for episode in valid_episodes)
        logs.update(tasks=tasks,
                    num_iterations=num_iterations,
                    train_returns=get_returns(train_episodes[0]),
                    valid_returns=get_returns(valid_episodes))

        # Save policy
        if args.output_folder is not None:
            with open(policy_filename, 'wb') as f:
                torch.save(policy.state_dict(), f)
def main(args):
    with open(args.config, 'r') as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

    if args.env_name in all_envs:
        config.update({
            "env-name": args.env_name + "-v0",
            "env-kwargs": {},
            "fast-batch-size": 16,
            "num-batches": 2000,
            "meta-batch-size": 1
        })

    if args.output_folder is not None:
        if not os.path.exists(args.output_folder):
            os.makedirs(args.output_folder)
        policy_filename = os.path.join(args.output_folder, 'policy.th')
        config_filename = os.path.join(args.output_folder, 'config.json')

        with open(config_filename, 'w') as f:
            config.update(vars(args))
            json.dump(config, f, indent=2)

    if args.seed is not None:
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

    model_name = "maml"
    os.makedirs(f"{LOG_DIR}/{model_name}/{config['env-name']}/", exist_ok=True)
    run_num = len(os.listdir(f"{LOG_DIR}/{model_name}/{config['env-name']}/"))
    log_path = f"{LOG_DIR}/{model_name}/{config['env-name']}/logs_{run_num}.txt"

    env = gym.make(config['env-name'], **config.get('env-kwargs', {}))
    env.close()

    policy = get_policy_for_env(env,
                                hidden_sizes=config['hidden-sizes'],
                                nonlinearity=config['nonlinearity'])
    policy.share_memory()

    baseline = LinearFeatureBaseline(get_input_size(env))

    sampler = MultiTaskSampler(config['env-name'],
                               env_kwargs=config.get('env-kwargs', {}),
                               batch_size=config['fast-batch-size'],
                               policy=policy,
                               baseline=baseline,
                               env=env,
                               seed=args.seed,
                               num_workers=args.num_workers)

    metalearner = MAMLTRPO(policy,
                           fast_lr=config['fast-lr'],
                           first_order=config['first-order'],
                           device=args.device)

    num_iterations = 0
    total_rewards = []
    start = time.time()
    step = 0
    # for batch in range(config['num-batches'] + 1):
    while step <= 500000:
        tasks = sampler.sample_tasks(num_tasks=config['meta-batch-size'])
        futures = sampler.sample_async(tasks,
                                       num_steps=config['num-steps'],
                                       fast_lr=config['fast-lr'],
                                       gamma=config['gamma'],
                                       gae_lambda=config['gae-lambda'],
                                       device=args.device)
        logs = metalearner.step(*futures,
                                max_kl=config['max-kl'],
                                cg_iters=config['cg-iters'],
                                cg_damping=config['cg-damping'],
                                ls_max_steps=config['ls-max-steps'],
                                ls_backtrack_ratio=config['ls-backtrack-ratio'])
        train_episodes, valid_episodes = sampler.sample_wait(futures)

        num_iterations += sum(sum(episode.lengths)
                              for episode in train_episodes[0])
        num_iterations += sum(sum(episode.lengths)
                              for episode in valid_episodes)
        logs.update(tasks=tasks,
                    num_iterations=num_iterations,
                    train_returns=get_returns(train_episodes[0]),
                    valid_returns=get_returns(valid_episodes))

        # Log progress roughly every 1000 environment steps
        old_step = step
        step += 250 if args.env_name in all_envs else train_episodes[0][0].lengths[0]
        if old_step == 0 or step // 1000 > old_step // 1000:
            rollouts = logs["valid_returns"][0]
            reward = np.mean(rollouts, -1)
            ep = step // 1000
            total_rewards.append(reward)
            string = (f"Step: {int(1000 * ep):7d}, "
                      f"Reward: {total_rewards[-1]:9.3f} [{np.std(rollouts):8.3f}], "
                      f"Avg: {np.mean(total_rewards, axis=0):9.3f} ({0.0:.3f}) "
                      f"<{get_time(start)}> ({{}})")
            print(string)
            with open(log_path, "a+") as f:
                f.write(f"{string}\n")
def main(args):
    with open(args.config, 'r') as f:
        config = json.load(f)

    if args.seed is not None:
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

    # env = gym.make(config['env-name'], **config['env-kwargs'])
    env = gym.make(config['env-name'], **config.get('env-kwargs', {}))
    env.close()

    # Policy
    policy = get_policy_for_env(env,
                                hidden_sizes=config['hidden-sizes'],
                                nonlinearity=config['nonlinearity'])
    with open(args.policy, 'rb') as f:
        state_dict = torch.load(f, map_location=torch.device(args.device))
        policy.load_state_dict(state_dict)
    policy.share_memory()

    # Baseline
    baseline = LinearFeatureBaseline(get_input_size(env))

    # Sampler
    sampler = MultiTaskSampler(config['env-name'],
                               env_kwargs=config.get('env-kwargs', {}),
                               batch_size=config['fast-batch-size'],
                               policy=policy,
                               baseline=baseline,
                               env=env,
                               seed=args.seed,
                               num_workers=args.num_workers)

    # Online adaptation, step 1: build the meta-learner so the policy can keep
    # updating at test time
    metalearner = MAMLTRPO(policy,
                           fast_lr=config['fast-lr'],
                           first_order=config['first-order'],
                           device=args.device)

    logs = {'tasks': []}
    train_returns, valid_returns = [], []
    num_iterations = 0
    for batch in trange(args.num_batches):
        # Old version (test-my.py): sample synchronously and only record returns
        # tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        # train_episodes, valid_episodes = sampler.sample(tasks,
        #                                                 num_steps=config['num-steps'],
        #                                                 fast_lr=config['fast-lr'],
        #                                                 gamma=config['gamma'],
        #                                                 gae_lambda=config['gae-lambda'],
        #                                                 device=args.device)
        # logs['tasks'].extend(tasks)
        # train_returns.append(get_returns(train_episodes[0]))
        # valid_returns.append(get_returns(valid_episodes))

        # New version (test-my-plus.py), online adaptation step 2: keep taking
        # TRPO meta-updates while evaluating
        tasks = sampler.sample_tasks(num_tasks=config['meta-batch-size'])
        futures = sampler.sample_async(tasks,
                                       num_steps=config['num-steps'],
                                       fast_lr=config['fast-lr'],
                                       gamma=config['gamma'],
                                       gae_lambda=config['gae-lambda'],
                                       device=args.device)
        logs = metalearner.step(*futures,
                                max_kl=config['max-kl'],
                                cg_iters=config['cg-iters'],
                                cg_damping=config['cg-damping'],
                                ls_max_steps=config['ls-max-steps'],
                                ls_backtrack_ratio=config['ls-backtrack-ratio'])
        train_episodes, valid_episodes = sampler.sample_wait(futures)

        num_iterations += sum(sum(episode.lengths)
                              for episode in train_episodes[0])
        num_iterations += sum(sum(episode.lengths)
                              for episode in valid_episodes)
        logs.update(tasks=tasks,
                    num_iterations=num_iterations,
                    train_returns=get_returns(train_episodes[0]),
                    valid_returns=get_returns(valid_episodes))

        train_returns.append(get_returns(train_episodes[0]))
        valid_returns.append(get_returns(valid_episodes))
        # for name, param in policy.layer1.named_parameters():
        #     print(name, param)

    logs['train_returns'] = np.concatenate(train_returns, axis=0)
    logs['valid_returns'] = np.concatenate(valid_returns, axis=0)

    print('name', args.output)
    with open(args.output, 'wb') as f:
        np.savez(f, **logs)
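# For reference, the `.npz` file written by the evaluation variant above can be
# inspected as follows. This is only a usage sketch; 'results.npz' is a
# placeholder for whatever path was passed as the output argument.
import numpy as np

with open('results.npz', 'rb') as f:
    logs = np.load(f, allow_pickle=True)  # 'tasks' is stored as an object array
    print('train return (mean over batches):', logs['train_returns'].mean())
    print('valid return (mean over batches):', logs['valid_returns'].mean())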
def main(args):
    with open(args.config, 'r') as f:
        config = yaml.load(f, Loader=yaml.FullLoader)

    if args.output_folder is not None:
        if not os.path.exists(args.output_folder):
            os.makedirs(args.output_folder)
        policy_filename = os.path.join(args.output_folder, 'policy.th')
        config_filename = os.path.join(args.output_folder, 'config.json')

        with open(config_filename, 'w') as f:
            config.update(vars(args))
            json.dump(config, f, indent=2)

    # Set tb_writer
    args.log_name = "env-name::%s_num-steps::%s_fast-lr::%s_log" % (
        config["env-name"], config["num-steps"], config["fast-lr"])
    tb_writer = SummaryWriter("./{0}/tb_{1}_logs".format(args.output_folder, args.log_name))
    log = set_log(args)

    # Set seed
    if args.seed is not None:
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)

    env = make_env(config["env-name"])()
    env.close()

    # Policy
    policy = get_policy_for_env(env,
                                hidden_sizes=config['hidden-sizes'],
                                nonlinearity=config['nonlinearity'])
    policy.share_memory()

    # Baseline
    baseline = LinearFeatureBaseline(get_input_size(env))

    # Sampler
    sampler = MultiTaskSampler(config['env-name'],
                               env_kwargs=config.get('env-kwargs', {}),
                               batch_size=config['fast-batch-size'],
                               policy=policy,
                               baseline=baseline,
                               env=env,
                               seed=args.seed,
                               num_workers=args.num_workers)

    metalearner = MAMLTRPO(policy,
                           fast_lr=config['fast-lr'],
                           first_order=config['first-order'],
                           device=args.device)

    best_score = -np.inf
    for batch in range(config['num-batches']):
        tasks = sampler.sample_tasks(num_tasks=config['meta-batch-size'])
        futures = sampler.sample_async(tasks,
                                       num_steps=config['num-steps'],
                                       fast_lr=config['fast-lr'],
                                       gamma=config['gamma'],
                                       gae_lambda=config['gae-lambda'],
                                       device=args.device)
        metalearner.step(*futures,
                         max_kl=config['max-kl'],
                         cg_iters=config['cg-iters'],
                         cg_damping=config['cg-damping'],
                         ls_max_steps=config['ls-max-steps'],
                         ls_backtrack_ratio=config['ls-backtrack-ratio'])

        # For logging
        train_episodes, valid_episodes = sampler.sample_wait(futures)
        train_score = np.mean(get_returns(train_episodes[0]))
        val_score = np.mean(get_returns(valid_episodes))

        log[args.log_name].info("At iteration {}, train_reward: {:.3f}".format(batch, train_score))
        tb_writer.add_scalars("reward/", {"train": train_score}, batch)
        log[args.log_name].info("At iteration {}, valid_reward: {:.3f}".format(batch, val_score))
        tb_writer.add_scalars("reward/", {"val": val_score}, batch)

        # Save policy with the best validation score
        if val_score > best_score:
            best_score = val_score
            log[args.log_name].info("Saving best valid score: {:.3f}".format(best_score))
            with open(policy_filename, 'wb') as f:
                torch.save(policy.state_dict(), f)
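# None of the variants above include their command-line wiring. A minimal entry
# point compatible with the attributes the training variants read from `args`
# (config, output_folder, seed, num_workers, device) might look like the sketch
# below; argument names and defaults are assumptions, not taken from the original
# scripts, and the evaluation variant additionally reads args.policy, args.output,
# and args.num_batches.
if __name__ == '__main__':
    import argparse
    import multiprocessing as mp

    import torch

    parser = argparse.ArgumentParser(description='MAML-TRPO training')
    parser.add_argument('--config', type=str, required=True,
                        help='path to the YAML configuration file')
    parser.add_argument('--output-folder', type=str, default=None,
                        help='where to save the policy and config')
    parser.add_argument('--seed', type=int, default=None,
                        help='random seed')
    parser.add_argument('--num-workers', type=int, default=mp.cpu_count() - 1,
                        help='number of sampler worker processes')
    parser.add_argument('--use-cuda', action='store_true',
                        help='run the meta-update on GPU if available')
    args = parser.parse_args()
    args.device = ('cuda' if args.use_cuda and torch.cuda.is_available()
                   else 'cpu')

    main(args)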