def main(args):
    """Meta-train an actor-critic MAML agent and log metrics to wandb.

    Side effects: creates ./saves/<output_folder>, writes config.json and a
    per-batch policy checkpoint, and logs rewards / meta critic loss to wandb.
    """
    # Mirror the few swept hyper-parameters into the wandb config.
    wandb.config.update({
        k: v
        for k, v in vars(args).items()
        if k in ['env_name', 'tau', 'critic_lr']
    })

    # These benchmark environments all have continuous action spaces.
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0'
    ])

    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    # Persist the run configuration (torch.device is not JSON-serializable,
    # so only its type string is stored).
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    sampler = BatchSampler(args.env_name,
                           args.seed,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)

    # Pick the policy head matching the action space.
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)

    # NOTE: the original code also built a LinearFeatureBaseline here, but it
    # was never passed to the learner — the learned critic plays that role.
    critic = Critic(int(np.prod(sampler.envs.observation_space.shape)),
                    1,
                    hidden_sizes=(args.hidden_size, ) * args.num_layers)

    metalearner = ActorCriticMetaLearner(sampler,
                                         policy,
                                         critic,
                                         gamma=args.gamma,
                                         fast_lr=args.fast_lr,
                                         tau=args.tau,
                                         device=args.device,
                                         critic_lr=args.critic_lr)
    wandb.watch(metalearner.critic)

    for batch in range(args.num_batches):
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        # Each element of `episodes` is a (pre-update, post-update) pair.
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        meta_critic_loss = metalearner.step(
            episodes,
            max_kl=args.max_kl,
            cg_iters=args.cg_iters,
            cg_damping=args.cg_damping,
            ls_max_steps=args.ls_max_steps,
            ls_backtrack_ratio=args.ls_backtrack_ratio)

        # Logging
        wandb.log(
            {
                'total_rewards/before_update':
                total_rewards([ep.rewards for ep, _ in episodes])
            },
            step=batch)
        wandb.log(
            {
                'total_rewards/after_update':
                total_rewards([ep.rewards for _, ep in episodes])
            },
            step=batch)
        wandb.log({'meta critic loss': meta_critic_loss.detach().item()},
                  step=batch)

        # Save policy network
        with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                  'wb') as f:
            torch.save(policy.state_dict(), f)
def main(args):
    """Meta-train a predator meta-policy against a prey agent.

    Runs an endless train loop (no stop condition): every iteration samples
    tasks, adapts, and meta-updates; every 10th iteration runs few-shot
    adaptation tests; every 100th iteration checkpoints the learner.
    """
    # Setup for logging
    tb_writer = SummaryWriter('./logs/tb_{}'.format(
        args.log_name))  # Tensorboard logging
    log = set_log(args)

    # Setup before meta-train starts
    sampler = BatchSampler(env_name=args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers,
                           args=args)

    # NOTE Observation space is a list with [predator0, predator1, ..., prey]
    # Thus using the index of 0
    policy = NormalMLPPolicy(
        input_size=int(np.prod(sampler.envs.observation_space[0].shape)),
        output_size=int(np.prod(sampler.envs.action_space[0].shape)),
        hidden_sizes=(args.hidden_size, ) * args.num_layers)

    baseline = LinearFeatureBaseline(
        input_size=int(np.prod(sampler.envs.observation_space[0].shape)))

    meta_learner = MetaLearner(sampler,
                               policy,
                               baseline,
                               gamma=args.gamma,
                               fast_lr=args.fast_lr,
                               tau=args.tau,
                               device=args.device,
                               args=args,
                               log=log,
                               tb_writer=tb_writer)
    # Optional warm-start from a checkpoint (disabled):
    # meta_learner.load(
    #     filename="theta_200", directory="./pytorch_models")

    meta_tester = MetaTester(sampler,
                             policy,
                             baseline,
                             gamma=args.gamma,
                             fast_lr=args.fast_lr,
                             tau=args.tau,
                             device=args.device,
                             args=args,
                             log=log,
                             tb_writer=tb_writer)

    prey = Prey(env=sampler._env,
                args=args,
                log=log,
                tb_writer=tb_writer,
                name="prey",
                i_agent=0)

    # Meta-train starts. Loop runs until externally interrupted.
    iteration = 0
    while True:
        # Sample train and validation episode
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size,
                                     test=False)
        episodes = meta_learner.sample(tasks,
                                       prey,
                                       first_order=args.first_order,
                                       iteration=iteration)

        # Train meta-policy
        meta_learner.step(episodes=episodes, args=args)

        # Test meta-policy every 10th iteration on held-out tasks
        if iteration % 10 == 0:
            test_tasks = sampler.sample_tasks(num_tasks=5, test=True)
            meta_tester.few_shot_adaptation(meta_policy=meta_learner.policy,
                                            tasks=test_tasks,
                                            first_order=args.first_order,
                                            iteration=iteration,
                                            prey=prey)

        # Checkpoint every 100th iteration
        if iteration % 100 == 0:
            meta_learner.save(iteration)

        iteration += 1
def main(args):
    """Meta-train MAML, logging to TensorBoard and the project logger.

    Fix over the original: params.json was written through an anonymous
    ``open(...)`` handle passed straight to ``json.dump`` and never closed;
    it is now written inside a context manager.
    """
    # Environments with continuous action spaces.
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', 'Point2DWalls-corner-v0',
        'Ant-v0', 'HalfCheetah-v0'
    ])

    writer = SummaryWriter(log_dir=args.log_dir)
    logger.configure(dir=args.log_dir, format_strs=['stdout', 'log', 'csv'])
    logger.log(args)
    # `with` guarantees the file is flushed and closed (the original leaked
    # the descriptor via json.dump(..., open(...), ...)).
    with open(os.path.join(args.log_dir, 'params.json'), 'w') as f:
        json.dump(vars(args), f, indent=2)

    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)

    # Pick the policy head matching the action space.
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    metalearner = MetaLearner(sampler,
                              policy,
                              baseline,
                              gamma=args.gamma,
                              fast_lr=args.fast_lr,
                              tau=args.tau,
                              device=args.device)

    for batch in range(args.num_batches):
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        metalearner.step(episodes,
                         max_kl=args.max_kl,
                         cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)

        # Compute the two averages once and reuse for both logging sinks.
        reward_pre = total_rewards([ep.rewards for ep, _ in episodes])
        reward_post = total_rewards([ep.rewards for _, ep in episodes])

        # Tensorboard
        writer.add_scalar('total_rewards/before_update', reward_pre, batch)
        writer.add_scalar('total_rewards/after_update', reward_post, batch)

        logger.logkv('return_avg_pre', reward_pre)
        logger.logkv('return_avg_post', reward_post)
        logger.dumpkvs()
def main(args):
    """Meta-train MAML; checkpoints the policy each batch under
    ./saves/<env_name>/<output_folder>.

    Fix over the original: removes the unused ``log_directory`` local.
    When ``args.load_dir`` is set, the loaded policy is evaluated for one
    batch and the process exits.
    """
    # Environments with continuous action spaces.
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', '2DPointEnvCorner-v0'
    ])

    save_folder = './saves/{0}'.format(args.env_name + '/' +
                                       args.output_folder)
    # For non-throwaway runs, bump the output folder name to the first
    # integer whose save folder does not already exist.
    # NOTE: this mutates args.output_folder.
    if args.output_folder != 'maml-trial' and args.output_folder != 'trial':
        i = 0
        while os.path.exists(save_folder):
            args.output_folder = str(i + 1)
            i += 1
            save_folder = './saves/{0}'.format(args.env_name + '/' +
                                               args.output_folder)
    os.makedirs(save_folder)
    writer = SummaryWriter('./logs/{0}'.format(args.env_name + '/' +
                                               args.output_folder))

    # Persist the run configuration (torch.device is not JSON-serializable).
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    # Pick the policy head matching the action space.
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    # Optional warm-start from a saved policy (evaluation-only mode below).
    if args.load_dir is not None:
        policy.load_state_dict(torch.load(args.load_dir))

    metalearner = MetaLearner(sampler,
                              policy,
                              baseline,
                              args,
                              gamma=args.gamma,
                              fast_lr=args.fast_lr,
                              tau=args.tau,
                              device=args.device)

    for batch in range(args.num_batches):
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        metalearner.step(episodes,
                         max_kl=args.max_kl,
                         cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)

        print('total_rewards/before_update',
              total_rewards([ep.rewards for ep, _ in episodes]), batch)
        print('total_rewards/after_update',
              total_rewards([ep.rewards for _, ep in episodes]), batch)

        # Plotting figure
        # plotting(episodes, batch, save_folder,args.num_plots)

        # Evaluation of a loaded policy needs just one batch.
        if args.load_dir is not None:
            sys.exit(0)

        # Tensorboard
        writer.add_scalar('total_rewards/before_update',
                          total_rewards([ep.rewards for ep, _ in episodes]),
                          batch)
        writer.add_scalar('total_rewards/after_update',
                          total_rewards([ep.rewards for _, ep in episodes]),
                          batch)

        # Save policy network
        with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                  'wb') as f:
            torch.save(policy.state_dict(), f)
def main(args):
    """Meta-train with either vanilla MAML (args.alg == 'simul') or a greedy
    multi-policy variant (args.alg == 'greedy').

    Logs to TensorBoard under ./logs/<alg> and writes config/checkpoints
    under ./saves/<alg>.
    """
    set_random_seed(args.random)
    # Environments with continuous action spaces.
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', '2DNavigationBiased-v0'
    ])

    writer = SummaryWriter('./logs/{0}'.format(args.alg))
    save_folder = './saves/{0}'.format(args.alg)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    # Persist the run configuration (torch.device is not JSON-serializable,
    # so only its type string is stored).
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers,
                           seed=args.random)
    # Pick the policy head matching the action space.
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    if args.alg == 'simul':  # vanilla maml
        metalearner = MetaLearner(sampler,
                                  policy,
                                  baseline,
                                  gamma=args.gamma,
                                  fast_lr=args.fast_lr,
                                  tau=args.tau,
                                  device=args.device)

        for batch in range(args.meta_policy_num * args.num_batches):
            # first sample tasks under the distribution
            tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
            # get episodes in the form of (train episodes, test episodes after adaption)
            episodes = metalearner.sample(tasks, first_order=args.first_order)
            metalearner.step(episodes,
                             max_kl=args.max_kl,
                             cg_iters=args.cg_iters,
                             cg_damping=args.cg_damping,
                             ls_max_steps=args.ls_max_steps,
                             ls_backtrack_ratio=args.ls_backtrack_ratio)

            # Tensorboard
            writer.add_scalar(
                'maml/before_update',
                total_rewards([ep.rewards for ep, _ in episodes]), batch)
            writer.add_scalar(
                'maml/after_update',
                total_rewards([ep.rewards for _, ep in episodes]), batch)

            # Save policy network
            with open(
                    os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                    'wb') as f:
                torch.save(policy.state_dict(), f)
    elif args.alg == 'greedy':  # multi-policy maml
        metalearner = KPolicyMetaLearner(sampler,
                                         policy,
                                         baseline,
                                         args.meta_policy_num,
                                         gamma=args.gamma,
                                         fast_lr=args.fast_lr,
                                         tau=args.tau,
                                         device=args.device)
        # visualize the policies' behavior
        trajectories = []
        # Optimize one meta-policy at a time (greedy over policy indices).
        for policy_idx in range(args.meta_policy_num):
            print(policy_idx)
            metalearner.optimize_policy_index(policy_idx)
            for batch in range(args.num_batches):
                print('batch num %d' % batch)
                tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
                metalearner.evaluate_optimized_policies(tasks)
                episodes = metalearner.sample(tasks,
                                              first_order=args.first_order)
                # loss is computed inside, then update policies
                metalearner.step(episodes,
                                 max_kl=args.max_kl,
                                 cg_iters=args.cg_iters,
                                 cg_damping=args.cg_damping,
                                 ls_max_steps=args.ls_max_steps,
                                 ls_backtrack_ratio=args.ls_backtrack_ratio)
                # not sure what to write in tensorboard...
                # Log post-adaptation rewards per (policy, task) pair.
                for epIdx in range(len(episodes)):
                    writer.add_scalar(
                        'kmaml/pi_' + str(policy_idx) + '_task_' +
                        str(epIdx),
                        total_rewards([episodes[epIdx][1].rewards]), batch)
            # use a random task (no update here anyway) to visualize meta-policies
            tasks = sampler.sample_tasks(num_tasks=1)
            trajectories.append(metalearner.sample_meta_policy(tasks[0]))
        plotTrajectories(trajectories)
# NOTE(review): top-level evaluation fragment — relies on `sampler`,
# `the_model`, and `args` being defined earlier in the file (not visible in
# this chunk). Evaluates pre- vs post-adaptation rewards on sampled tasks.
baseline = LinearFeatureBaseline(
    int(np.prod(sampler.envs.observation_space.shape)))

metalearner = MetaLearner(sampler,
                          the_model,
                          baseline,
                          gamma=args.gamma,
                          fast_lr=args.fast_lr,
                          tau=args.tau,
                          device=args.device)

test_batch_size = 2
test_reward_before = []  # total rewards before the inner-loop update, per task
test_reward_after = []   # total rewards after adaptation, per task
for test_batch in range(test_batch_size):
    #sample one task
    test_task = sampler.sample_tasks(num_tasks=1)
    print("test_task: ", test_task)
    sampler.reset_task(test_task[0])
    #sample some episodes for that task
    episodes = metalearner.sample(test_task, first_order=args.first_order)
    test_reward_before.append(
        total_rewards([ep.rewards for ep, _ in episodes]))
    test_reward_after.append(
        total_rewards([ep.rewards for _, ep in episodes]))
    print("before:", test_reward_before, "; after: ", test_reward_after, "\n")
# Report the averages over the sampled test tasks.
print("before average: ", np.mean(test_reward_before),
      "after average: ", np.mean(test_reward_after))
def main(args):
    """Meta-train MAML on a multi-task (metaworld-style) benchmark with
    optional active task sampling, logging everything to wandb.

    Fixes over the original: the per-task improvement metric keys were
    misspelled 'imrovement' (train and test); both are now 'improvement'.

    Side effects: creates the save folder, writes config.json, periodic and
    final policy checkpoints, and logs train/test metrics to wandb.
    """
    save_folder = f'saves/{args.output_folder + get_date_str()}'
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    # Persist the run configuration (torch.device is not JSON-serializable).
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    print('Initializing samplers...')
    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    test_sampler = BatchSampler(args.env_name,
                                test_env=True,
                                batch_size=args.fast_batch_size,
                                num_workers=max(1, args.num_workers // 2))

    policy = NormalMLPPolicy(
        int(np.prod(sampler.envs.observation_space.shape)),
        int(np.prod(sampler.envs.action_space.shape)),
        hidden_sizes=(args.hidden_size, ) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    print('Initializing meta-learners...')
    metalearner = MetaLearner(sampler, policy, baseline,
                              gamma=args.gamma, fast_lr=args.fast_lr,
                              tau=args.tau, device=args.device)
    # NOTE: we need this metalearner only to sample test tasks
    test_metalearner = MetaLearner(test_sampler, policy, baseline,
                                   gamma=args.gamma, fast_lr=args.fast_lr,
                                   tau=args.tau, device=args.device)

    print('Starting the training')
    # Initialize logging
    wandb.init()
    wandb.config.update(args)

    task_name2id = {name: i for i, name in enumerate(sampler._env._task_names)}
    task_id2name = sampler._env._task_names
    # Task-sampling distribution, initially uniform; `uniform` is kept for
    # epsilon-mixing so no task's probability collapses to zero.
    task2prob = np.ones(sampler._env.num_tasks) / sampler._env.num_tasks
    uniform = np.ones_like(task2prob) / sampler._env.num_tasks

    # outer loop (meta-training)
    for i in range(args.num_batches):
        print(f'Batch {i}')

        # sample trajectories from random tasks
        print(f'\tSampling a batch of {args.meta_batch_size} training tasks')
        tasks = sampler.sample_tasks(
            num_tasks=args.meta_batch_size,
            task2prob=0.99 * task2prob + 0.01 * uniform)
        # Note: Dirty hack to overcome metaworld dirty hack
        task_names = [sampler._env._task_names[t['task']] for t in tasks]

        # inner loop (adaptation)
        # returns list of tuples (train_episodes, valid_episodes)
        print('\tTraining')
        episodes = metalearner.sample(tasks, first_order=args.first_order)

        print('\tUpdating the meta-model')
        metalearner.step(episodes,
                         max_kl=args.max_kl,
                         cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)

        # Logging
        # before: before parameters update
        # after: after parameters adaptation to the task
        r_before = total_rewards([ep.rewards for ep, _ in episodes])
        r_after = total_rewards([ep.rewards for _, ep in episodes])

        test_episode_infos = [ep._info_list for ep, _ in episodes]
        success_rate_before, task_success_rate_before = get_success_rate(
            test_episode_infos, task_names, per_task=True)
        test_episode_infos = [ep._info_list for _, ep in episodes]
        success_rate_after, task_success_rate_after = get_success_rate(
            test_episode_infos, task_names, per_task=True)

        wandb.log(
            {
                'total_rewards/before_update': r_before,
                'total_rewards/after_update': r_after,
                'success_rate/before_update': success_rate_before,
                'success_rate/after_update': success_rate_after,
                'success_rate/improvement':
                success_rate_after - success_rate_before,
                'success_rate/before_update_macro':
                np.mean(list(task_success_rate_before.values())),
                'success_rate/after_update_macro':
                np.mean(list(task_success_rate_after.values())),
            },
            step=i)
        wandb.log(
            {
                f'success_rate/after_update/{task}': rate
                for task, rate in task_success_rate_after.items()
            },
            step=i)
        wandb.log(
            {
                f'success_rate/before_update/{task}': rate
                for task, rate in task_success_rate_before.items()
            },
            step=i)
        # FIX: key was misspelled 'imrovement' in the original.
        wandb.log(
            {
                f'success_rate/improvement/{task}':
                task_success_rate_after[task] - task_success_rate_before[task]
                for task in task_success_rate_before.keys()
            },
            step=i)
        wandb.log(
            {
                f'n_acquired_tasks/before_update/at_{x}': sum(
                    rate > x for rate in task_success_rate_before.values())
                for x in [0.001, 0.01, 0.05, 0.1, 0.5]
            },
            step=i)
        wandb.log(
            {
                f'n_acquired_tasks/after_update/at_{x}': sum(
                    rate > x for rate in task_success_rate_after.values())
                for x in [0.001, 0.01, 0.05, 0.1, 0.5]
            },
            step=i)

        if args.active_learning:
            # Recompute the task-sampling distribution from post-adaptation
            # success rates: harder (less successful) tasks get more mass.
            new_task2prob = np.zeros_like(task2prob)
            if args.prob_f == 'linear':
                norm = 1e-7 + sum(task_success_rate_after.values())
                for task, rate in task_success_rate_after.items():
                    task_id = task_name2id[task]
                    new_task2prob[task_id] = 1. - rate / norm
            elif args.prob_f == 'softmax':
                # softmax(1 - rate)
                # numerical stability trick
                # http://cs231n.github.io/linear-classify/#softmax
                max_f = 1 - min(task_success_rate_after.values())
                for task, rate in task_success_rate_after.items():
                    task_id = task_name2id[task]
                    f = 1 - rate
                    new_task2prob[task_id] = np.exp(
                        (f - max_f) / args.temperature)
                new_task2prob = new_task2prob / (1e-7 + sum(new_task2prob))
            elif args.prob_f == 'softmax2':
                # 1 - softmax(rate)
                max_f = max(task_success_rate_after.values())
                for task, rate in task_success_rate_after.items():
                    task_id = task_name2id[task]
                    new_task2prob[task_id] = np.exp(
                        (rate - max_f) / args.temperature)
                new_task2prob = 1. - new_task2prob / (1e-7 +
                                                      sum(new_task2prob))
            else:
                raise RuntimeError(
                    'prob-f should be either "softmax", "softmax2" or "linear"'
                )

            # Exponential smoothing keeps the distribution from oscillating.
            alpha = args.success_rate_smoothing
            task2prob = alpha * task2prob + (1 - alpha) * new_task2prob
            task2prob /= sum(task2prob)
            assert all(task2prob > 0)  # strictly!

        wandb.log(
            {
                f'task2prob/{task_id2name[task_id]}': prob
                for task_id, prob in enumerate(task2prob)
            },
            step=i)

        # meta-test
        if i % args.eval_every == 0:
            print('Evaluating on meta-test')
            # save policy network
            _save_path = os.path.join(save_folder, 'policy-{0}.pt'.format(i))
            with open(_save_path, 'wb') as f:
                torch.save(policy.state_dict(), f)
            wandb.save(_save_path)

            # Evaluate on meta-test
            tasks = test_sampler.sample_tasks(
                num_tasks=2 * args.meta_batch_size)
            # Note: Dirty hack to overcome metaworld dirty hack
            task_names = [
                test_sampler._env._task_names[t['task']] for t in tasks
            ]
            episodes = test_metalearner.sample(tasks,
                                               first_order=args.first_order)

            r_before = total_rewards([ep.rewards for ep, _ in episodes])
            r_after = total_rewards([ep.rewards for _, ep in episodes])
            test_episode_infos = [ep._info_list for ep, _ in episodes]
            success_rate_before, task_success_rate_before = get_success_rate(
                test_episode_infos, task_names, per_task=True)
            test_episode_infos = [ep._info_list for _, ep in episodes]
            success_rate_after, task_success_rate_after = get_success_rate(
                test_episode_infos, task_names, per_task=True)

            wandb.log(
                {
                    'total_rewards_test/before_update': r_before,
                    'total_rewards_test/after_update': r_after,
                    'success_rate_test/before_update': success_rate_before,
                    'success_rate_test/after_update': success_rate_after,
                    'success_rate_test/improvement':
                    success_rate_after - success_rate_before
                },
                step=i)
            wandb.log(
                {
                    f'success_rate_test/after_update/{task}': rate
                    for task, rate in task_success_rate_after.items()
                },  # noqa: E501
                step=i)
            wandb.log(
                {
                    f'success_rate_test/before_update/{task}': rate
                    for task, rate in task_success_rate_before.items()
                },  # noqa: E501
                step=i)
            # FIX: key was misspelled 'imrovement' in the original.
            wandb.log(
                {
                    f'success_rate_test/improvement/{task}':
                    task_success_rate_after[task] -
                    task_success_rate_before[task]
                    for task in task_success_rate_before.keys()
                },
                step=i)

    print('Saving the final model')
    # save final policy
    _save_path = os.path.join(save_folder, 'policy-final.pt')
    with open(_save_path, 'wb') as f:
        torch.save(policy.state_dict(), f)
    wandb.save(_save_path)
def main(args):
    """Meta-train MAML with Q-function inner-loop options, logging rewards
    and adaptation diagnostics to TensorBoard and checkpointing each batch."""
    # Seed every RNG in use for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    torch.manual_seed(args.seed)

    # Environments with continuous action spaces.
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0'
    ])

    writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    # Persist the run configuration (torch.device is not JSON-serializable,
    # so only its type string is stored).
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    # Pick the policy head matching the action space.
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    # The string-valued flags are compared against 'true' to get booleans.
    # NOTE(review): 'q_residuce_gradient' looks misspelled, but it is the
    # keyword name MetaLearner expects — do not "fix" it here alone.
    metalearner = MetaLearner(
        sampler,
        policy,
        baseline,
        gamma=args.gamma,
        fast_lr=args.fast_lr,
        tau=args.tau,
        q_inner=args.inner_q == 'true',
        q_residuce_gradient=args.inner_q_residue_gradient == 'true',
        q_soft=args.inner_q_soft == 'true',
        q_soft_temp=args.inner_q_soft_temp,
        device=args.device,
    )

    for batch in range(args.num_batches):
        # Release cached GPU memory between batches.
        if args.device.type == 'cuda':
            torch.cuda.empty_cache()
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        episodes, adaptation_info = metalearner.sample(
            tasks, first_order=args.first_order)
        metalearner.step(episodes,
                         max_kl=args.max_kl,
                         cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)

        # Tensorboard
        pre_update_rewards = total_rewards([ep.rewards for ep, _ in episodes])
        post_update_rewards = total_rewards(
            [ep.rewards for _, ep in episodes])
        writer.add_scalar('total_rewards/before_update', pre_update_rewards,
                          batch)
        writer.add_scalar('total_rewards/after_update', post_update_rewards,
                          batch)
        writer.add_scalar('total_rewards/rewards_improvement',
                          post_update_rewards - pre_update_rewards, batch)
        writer.add_scalar('adaptation/pre_update_inner_loss',
                          adaptation_info.mean_pre_update_loss, batch)
        writer.add_scalar('adaptation/post_update_inner_loss',
                          adaptation_info.mean_post_update_loss, batch)
        writer.add_scalar('adaptation/inner_loss_improvement',
                          adaptation_info.mean_loss_improvment, batch)
        writer.add_scalar('adaptation/weight_change',
                          adaptation_info.mean_weight_change, batch)

        # Save policy network
        with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                  'wb') as f:
            torch.save(policy.state_dict(), f)
def main(args):
    """Meta-train with the NGLVC VPG meta-learner on a fixed task batch.

    Fix over the original: ``args.optimizer is 'sgd'`` compared string
    IDENTITY instead of equality (works only via CPython interning); it is
    now ``==``. NOTE: tasks are sampled once before the loop and reused for
    every batch, as in the original.
    """
    # Environments with continuous action spaces.
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0'
    ])

    writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    # Persist the run configuration (torch.device is not JSON-serializable).
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    # Pick the policy head matching the action space.
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    metalearner = MetaLearnerNGLVCVPG(sampler,
                                      policy,
                                      baseline,
                                      gamma=args.gamma,
                                      fast_lr=args.fast_lr,
                                      tau=args.tau,
                                      device=args.device,
                                      verbose=args.verbose)

    # Fixed task batch, reused across all meta-updates.
    tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
    for batch in range(args.num_batches):
        start = time.time()
        episodes = metalearner.sample(tasks,
                                      first_order=args.first_order,
                                      cg_iters=args.cg_iters)
        sample_time = time.time() - start

        start = time.time()
        # BUG FIX: was `args.optimizer is 'sgd'` (identity, not equality).
        if args.optimizer == 'sgd':
            metalearner.step_sgd(episodes,
                                 max_kl=args.max_kl,
                                 cg_iters=args.cg_iters,
                                 cg_damping=args.cg_damping,
                                 ls_max_steps=args.ls_max_steps,
                                 ls_backtrack_ratio=args.ls_backtrack_ratio)
        else:
            metalearner.step_adam(episodes,
                                  max_kl=args.max_kl,
                                  cg_iters=args.cg_iters,
                                  cg_damping=args.cg_damping,
                                  ls_max_steps=args.ls_max_steps,
                                  ls_backtrack_ratio=args.ls_backtrack_ratio)
        update_time = time.time() - start

        # Compute the averages once and reuse for TensorBoard and stdout.
        reward_pre = total_rewards([ep.rewards for ep, _ in episodes])
        reward_post = total_rewards([ep.rewards for _, ep in episodes])

        # Tensorboard
        writer.add_scalar('total_rewards/before_update', reward_pre, batch)
        writer.add_scalar('total_rewards/after_update', reward_post, batch)
        print(
            "Batch {}. before_update: {}, after_update: {}\n sample time {}, update_time {}"
            .format(batch, reward_pre, reward_post, sample_time, update_time))

        # Save policy network
        with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                  'wb') as f:
            torch.save(policy.state_dict(), f)