def main(args):
    # Hard-coded in this variant, so the categorical branch below is dead code.
    continuous_actions = True

    writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    sampler = BatchSampler(args.env_name, batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    metalearner = MetaLearner(sampler, policy, baseline, gamma=args.gamma,
                              fast_lr=args.fast_lr, tau=args.tau,
                              device=args.device)

    for batch in range(args.num_batches):
        print("========== BATCH NUMBER {0} ==========".format(batch))
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        metalearner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)

        # Tensorboard
        writer.add_scalar('total_rewards/before_update',
                          total_rewards([ep.rewards for ep, _ in episodes]),
                          batch)
        writer.add_scalar('total_rewards/after_update',
                          total_rewards([ep.rewards for _, ep in episodes]),
                          batch)

        # Save policy network. The +256 offset in the filename presumably
        # continues numbering from an earlier run; kept as in the original.
        with open(os.path.join(save_folder,
                               'policy-{0}.pt'.format(batch + 256)), 'wb') as f:
            torch.save(policy.state_dict(), f)
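# --- Helper sketch (not part of the original file) ---
# Every variant in this file calls total_rewards(...) on a list of per-episode
# reward tensors, but the helper itself is not shown. A minimal sketch,
# assuming the upstream pytorch-maml-rl convention of [T, batch_size] reward
# tensors:
import torch

def total_rewards(episodes_rewards, aggregation=torch.mean):
    # Sum over time (dim 0), aggregate within each episode batch, then
    # average across the meta-batch; returns a Python float.
    rewards = torch.mean(torch.stack(
        [aggregation(torch.sum(rewards, dim=0))
         for rewards in episodes_rewards], dim=0))
    return rewards.item()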
def main(args):
    continuous_actions = (args.env_name in ['AntVel-v1', 'AntDir-v1',
                                            'AntPos-v0', 'HalfCheetahVel-v1',
                                            'HalfCheetahDir-v1',
                                            '2DNavigation-v0'])

    writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    sampler = BatchSampler(args.env_name, batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    metalearner = MetaLearnerNGLVCVPG(sampler, policy, baseline,
                                      gamma=args.gamma, fast_lr=args.fast_lr,
                                      tau=args.tau, device=args.device,
                                      verbose=args.verbose)

    for batch in range(args.num_batches):
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)

        start = time.time()
        episodes, kls, param_diffs = metalearner.sample(
            tasks, first_order=args.first_order, cg_iters=args.cg_iters)
        sample_time = time.time() - start

        start = time.time()
        # Was `args.optimizer is 'sgd'`: `is` tests object identity, not
        # string equality, so the SGD branch could be skipped even when
        # requested.
        if args.optimizer == 'sgd':
            metalearner.step_sgd(episodes, max_kl=args.max_kl,
                                 cg_iters=args.cg_iters,
                                 cg_damping=args.cg_damping,
                                 ls_max_steps=args.ls_max_steps,
                                 ls_backtrack_ratio=args.ls_backtrack_ratio)
        else:
            metalearner.step_adam(episodes, max_kl=args.max_kl,
                                  cg_iters=args.cg_iters,
                                  cg_damping=args.cg_damping,
                                  ls_max_steps=args.ls_max_steps,
                                  ls_backtrack_ratio=args.ls_backtrack_ratio)
        update_time = time.time() - start

        # Tensorboard
        writer.add_scalar('total_rewards/before_update',
                          total_rewards([ep.rewards for ep, _ in episodes]),
                          batch)
        writer.add_scalar('total_rewards/after_update',
                          total_rewards([ep.rewards for _, ep in episodes]),
                          batch)
        writer.add_scalar('kl-mean between meta update',
                          torch.mean(torch.stack(kls)), batch)
        writer.add_scalar('kl-std between meta update',
                          torch.std(torch.stack(kls)), batch)
        writer.add_scalar('Euclidean-distance-mean between meta update',
                          torch.mean(torch.stack(param_diffs)), batch)
        writer.add_scalar('Euclidean-distance-std between meta update',
                          torch.std(torch.stack(param_diffs)), batch)

        print("Batch {}. before_update: {}, after_update: {}\n"
              " sample time {}, update_time {}".format(
                  batch,
                  total_rewards([ep.rewards for ep, _ in episodes]),
                  total_rewards([ep.rewards for _, ep in episodes]),
                  sample_time, update_time))
        print("Batch {}. kl-divergence between meta update: {}, kl std: {}".format(
            batch, torch.mean(torch.stack(kls)), torch.std(torch.stack(kls))))
        print("Batch {}. Euclidean-distance-mean meta update: {}, "
              "Euclidean-distance-std: {}".format(
                  batch, torch.mean(torch.stack(param_diffs)),
                  torch.std(torch.stack(param_diffs))))

        # Save policy network
        with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                  'wb') as f:
            torch.save(policy.state_dict(), f)
def main(args):
    group_name = ''.join([
        random.choice(string.ascii_letters + string.digits) for n in range(4)
    ])
    wandb.init(group=group_name, job_type='optimizer', tensorboard=True)
    wandb.config.update(args)

    device = torch.device(args.device)
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0'
    ])

    writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=device.type)
        json.dump(config, f, indent=2)

    sampler = BatchSampler(group_name, args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    metalearner = MetaLearner(sampler, policy, baseline, gamma=args.gamma,
                              fast_lr=args.fast_lr, tau=args.tau,
                              device=device)

    for batch in range(args.num_batches):
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        metalearner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)

        # Tensorboard
        writer.add_scalar('total_rewards/before_update',
                          total_rewards([ep.rewards for ep, _ in episodes]),
                          batch)
        writer.add_scalar('total_rewards/after_update',
                          total_rewards([ep.rewards for _, ep in episodes]),
                          batch)

        # Save policy network
        with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                  'wb') as f:
            torch.save(policy.state_dict(), f)
def main(args):
    logging.basicConfig(filename=args.debug_file, level=logging.WARNING,
                        filemode='w')
    logging.getLogger('metalearner').setLevel(logging.INFO)

    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', 'PendulumTheta-v0'
    ])

    writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    sampler = BatchSampler(args.env_name, batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)

    # if args.baseline == 'critic shared':
    #     policy = NormalMLPPolicyA2C(
    #         int(np.prod(sampler.envs.observation_space.shape)),
    #         int(np.prod(sampler.envs.action_space.shape)),
    #         hidden_sizes=(args.hidden_size,) * args.num_layers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size,) * args.num_layers)

    if args.baseline == 'linear':
        baseline = LinearFeatureBaseline(
            int(np.prod(sampler.envs.observation_space.shape)))
    elif args.baseline == 'critic separate':
        baseline = CriticFunction(
            int(np.prod(sampler.envs.observation_space.shape)), 1,
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    # elif args.baseline == 'critic shared':
    #     RANJANI TO DO

    metalearner = MetaLearner(sampler, policy, baseline, gamma=args.gamma,
                              fast_lr=args.fast_lr, tau=args.tau,
                              device=args.device, baseline_type=args.baseline,
                              cliprange=args.cliprange,
                              noptepochs=args.noptepochs, usePPO=args.usePPO,
                              nminibatches=args.nminibatches,
                              ppo_lr=args.ppo_lr, useSGD=args.useSGD,
                              ppo_momentum=args.ppo_momentum,
                              grad_clip=args.grad_clip)

    for batch in range(args.num_batches):
        print("*********************** Batch: " + str(batch) +
              " ****************************")
        print("Creating tasks...")
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        print("Creating episodes...")
        episodes, grad_norm = metalearner.sample(tasks,
                                                 first_order=args.first_order)
        print("Taking a meta step...")
        metalearner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)

        print("Writing results to tensorboard...")
        # Tensorboard
        writer.add_scalar('total_rewards/before_update',
                          total_rewards([ep.rewards for ep, _ in episodes]),
                          batch)
        writer.add_scalar('total_rewards/after_update',
                          total_rewards([ep.rewards for _, ep in episodes]),
                          batch)
        if grad_norm:
            # The original call omitted the global step, so every point was
            # logged at the same x-position; pass `batch` like the calls above.
            writer.add_scalar('PPO mb grad norm', np.average(grad_norm), batch)
            print(np.average(grad_norm))

        print("Saving policy network...")
        # Save policy network
        with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                  'wb') as f:
            torch.save(policy.state_dict(), f)
        print("***************************************************")
def main(args):
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0'
    ])

    writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    sampler = BatchSampler(args.env_name, batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)

    if args.env_name == 'AntVel-v1':
        param_bounds = {"goal": [0, 3]}
    if args.env_name == 'AntPos-v0':
        param_bounds = {"x": [-3, 3], "y": [-3, 3]}
    teacher = TeacherController(args.teacher, args.nb_test_episodes,
                                param_bounds, seed=args.seed,
                                teacher_params={})
    tree = TreeLSTM(args.tree_hidden_layer, len(param_bounds.keys()),
                    args.cluster_0, args.cluster_1, device=args.device)

    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape) +
                args.tree_hidden_layer),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size,) * args.num_layers,
            tree=tree)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape) +
                args.tree_hidden_layer),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size,) * args.num_layers,
            tree=tree)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)) +
        args.tree_hidden_layer)

    metalearner = MetaLearner(sampler, policy, baseline, tree=tree,
                              gamma=args.gamma, fast_lr=args.fast_lr,
                              tau=args.tau, device=args.device)

    all_tasks = []
    for batch in range(args.num_batches):
        print("starting iteration {}".format(batch))
        tasks = []
        for _ in range(args.meta_batch_size):
            if args.env_name == 'AntPos-v0':
                tasks.append({"position": teacher.task_generator.sample_task()})
            if args.env_name == 'AntVel-v1':
                tasks.append({"velocity": teacher.task_generator.sample_task()[0]})
        all_tasks.append(tasks)
        # tasks = np.array(tasks)
        # tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        with open('./logs/{0}/task_list.pkl'.format(args.output_folder),
                  'wb') as pf:
            pickle.dump(all_tasks, pf)

        episodes = metalearner.sample(tasks, first_order=args.first_order)
        metalearner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)

        # Tensorboard
        writer.add_scalar('total_rewards/before_update',
                          total_rewards([ep.rewards for ep, _ in episodes]),
                          batch)
        writer.add_scalar('total_rewards/after_update',
                          total_rewards([ep.rewards for _, ep in episodes]),
                          batch)

        # Per-task post-update returns, fed back to the teacher
        tr = [ep.rewards for _, ep in episodes]
        tr = [torch.mean(torch.sum(rewards, dim=0)).item() for rewards in tr]
        print("rewards:", tr)
        for t in range(args.meta_batch_size):
            if args.env_name == 'AntPos-v0':
                teacher.task_generator.update(tasks[t]["position"], tr[t])
            if args.env_name == 'AntVel-v1':
                teacher.task_generator.update(np.array([tasks[t]["velocity"]]),
                                              tr[t])

        # Save policy network
        with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                  'wb') as f:
            torch.save(policy.state_dict(), f)
        # Save tree
        torch.save(tree, os.path.join(save_folder, 'tree-{0}.pt'.format(batch)))
def main(args):
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', 'RVONavigation-v0',
        'RVONavigationAll-v0'
    ])
    assert continuous_actions == True

    writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_folder = './saves/{0}'.format(args.output_folder)
    log_traj_folder = './logs/{0}'.format(args.output_traj_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    if not os.path.exists(log_traj_folder):
        os.makedirs(log_traj_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    sampler = BatchSampler(args.env_name, batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)

    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size,) * args.num_layers)

    # baseline = LinearFeatureBaseline(
    #     int(np.prod(sampler.envs.observation_space.shape)))
    baseline = LinearFeatureBaseline(int(np.prod((2,))))

    resume_training = True
    if resume_training:
        saved_policy_path = os.path.join(
            './TrainingResults/result2/saves/{0}'.format('maml-2DNavigation-dir'),
            'policy-180.pt')
        if os.path.isfile(saved_policy_path):
            print('Loading a saved policy')
            policy_info = torch.load(saved_policy_path)
            policy.load_state_dict(policy_info)
        else:
            sys.exit("The requested policy does not exist for loading")

    metalearner = MetaLearner(sampler, policy, baseline, gamma=args.gamma,
                              fast_lr=args.fast_lr, tau=args.tau,
                              device=args.device)

    start_time = time.time()
    for batch in range(args.num_batches):
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        metalearner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)

        # Tensorboard. This variant's total_rewards returns a
        # (total, distance, collision) breakdown per update.
        total_reward_be, dist_reward_be, col_reward_be = total_rewards(
            [ep.rewards for ep, _ in episodes])
        total_reward_af, dist_reward_af, col_reward_af = total_rewards(
            [ep.rewards for _, ep in episodes])

        # Open and close on every batch so the appended lines are flushed
        # to disk immediately.
        log_reward_total_file = open('./logs/reward_total.txt', 'a')
        log_reward_dist_file = open('./logs/reward_dist.txt', 'a')
        log_reward_col_file = open('./logs/reward_col.txt', 'a')
        log_reward_total_file.write(str(batch) + ',' + str(total_reward_be) +
                                    ',' + str(total_reward_af) + '\n')
        log_reward_dist_file.write(str(batch) + ',' + str(dist_reward_be) +
                                   ',' + str(dist_reward_af) + '\n')
        log_reward_col_file.write(str(batch) + ',' + str(col_reward_be) +
                                  ',' + str(col_reward_af) + '\n')
        log_reward_total_file.close()
        log_reward_dist_file.close()
        log_reward_col_file.close()

        writer.add_scalar('total_rewards/before_update', total_reward_be, batch)
        writer.add_scalar('total_rewards/after_update', total_reward_af, batch)
        writer.add_scalar('distance_reward/before_update', dist_reward_be, batch)
        writer.add_scalar('distance_reward/after_update', dist_reward_af, batch)
        writer.add_scalar('collision_rewards/before_update', col_reward_be, batch)
        writer.add_scalar('collision_rewards/after_update', col_reward_af, batch)

        if batch % args.save_every == 0:
            # Save the policy network only periodically to save time/space
            print('Saving model {}'.format(batch))
            with open(os.path.join(save_folder,
                                   'policy-{0}.pt'.format(batch)), 'wb') as f:
                torch.save(policy.state_dict(), f)

        if batch % 30 == 0:
            with open(os.path.join(log_traj_folder,
                                   'train_episodes_observ_' + str(batch) + '.pkl'),
                      'wb') as f:
                pickle.dump([ep.observations.cpu().numpy()
                             for ep, _ in episodes], f)
            with open(os.path.join(log_traj_folder,
                                   'valid_episodes_observ_' + str(batch) + '.pkl'),
                      'wb') as f:
                pickle.dump([ep.observations.cpu().numpy()
                             for _, ep in episodes], f)
            # with open(os.path.join(log_traj_folder, 'train_episodes_ped_state_' + str(batch) + '.pkl'), 'wb') as f:
            #     pickle.dump([ep.hid_observations.cpu().numpy() for ep, _ in episodes], f)
            # with open(os.path.join(log_traj_folder, 'valid_episodes_ped_state_' + str(batch) + '.pkl'), 'wb') as f:
            #     pickle.dump([ep.hid_observations.cpu().numpy() for _, ep in episodes], f)

            # Save tasks. A sample task list of 2:
            # [{'goal': array([0.0209588 , 0.15981938])}, {'goal': array([0.45034602, 0.17282322])}]
            with open(os.path.join(log_traj_folder,
                                   'tasks_' + str(batch) + '.pkl'), 'wb') as f:
                pickle.dump(tasks, f)
        else:
            # Overwritten on each batch
            with open(os.path.join(log_traj_folder,
                                   'latest_train_episodes_observ.pkl'),
                      'wb') as f:
                pickle.dump([ep.observations.cpu().numpy()
                             for ep, _ in episodes], f)
            with open(os.path.join(log_traj_folder,
                                   'latest_valid_episodes_observ.pkl'),
                      'wb') as f:
                pickle.dump([ep.observations.cpu().numpy()
                             for _, ep in episodes], f)
            # with open(os.path.join(log_traj_folder, 'latest_train_episodes_ped_state.pkl'), 'wb') as f:
            #     pickle.dump([ep.hid_observations.cpu().numpy() for ep, _ in episodes], f)
            # with open(os.path.join(log_traj_folder, 'latest_valid_episodes_ped_state.pkl'), 'wb') as f:
            #     pickle.dump([ep.hid_observations.cpu().numpy() for _, ep in episodes], f)
            with open(os.path.join(log_traj_folder, 'latest_tasks.pkl'),
                      'wb') as f:
                pickle.dump(tasks, f)

        print('finished epoch {}; time elapsed: {}'.format(
            batch, time_elapsed(time.time() - start_time)))
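# --- Helper sketch (not part of the original file) ---
# Unlike the other variants, the RVO navigation variant above unpacks three
# values (total, distance, collision) from total_rewards. A hedged sketch of
# such a variant, assuming each episode stores rewards as a [T, batch, 3]
# tensor whose last dimension holds the (total, distance, collision)
# components; the actual fork may decompose rewards differently:
import torch

def total_rewards(episodes_rewards):
    # [episodes, 3]: per-episode-batch mean of time-summed component rewards
    stacked = torch.stack([torch.mean(torch.sum(r, dim=0), dim=0)
                           for r in episodes_rewards], dim=0)
    total, dist, col = torch.mean(stacked, dim=0)
    return total.item(), dist.item(), col.item()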
def main(args):
    wandb.config.update({
        k: v for k, v in vars(args).items()
        if k in ['env_name', 'tau', 'critic_lr']
    })

    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0'
    ])

    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    sampler = BatchSampler(args.env_name, args.seed,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))
    critic = Critic(int(np.prod(sampler.envs.observation_space.shape)), 1,
                    hidden_sizes=(args.hidden_size,) * args.num_layers)

    metalearner = ActorCriticMetaLearner(sampler, policy, critic,
                                         gamma=args.gamma,
                                         fast_lr=args.fast_lr, tau=args.tau,
                                         device=args.device,
                                         critic_lr=args.critic_lr)
    wandb.watch(metalearner.critic)

    for batch in range(args.num_batches):
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        meta_critic_loss = metalearner.step(
            episodes, max_kl=args.max_kl, cg_iters=args.cg_iters,
            cg_damping=args.cg_damping, ls_max_steps=args.ls_max_steps,
            ls_backtrack_ratio=args.ls_backtrack_ratio)

        # Logging
        wandb.log({'total_rewards/before_update':
                   total_rewards([ep.rewards for ep, _ in episodes])},
                  step=batch)
        wandb.log({'total_rewards/after_update':
                   total_rewards([ep.rewards for _, ep in episodes])},
                  step=batch)
        wandb.log({'meta critic loss': meta_critic_loss.detach().item()},
                  step=batch)

        # Save policy network
        with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                  'wb') as f:
            torch.save(policy.state_dict(), f)
def main(args):
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', '2DPointEnvCorner-v0'
    ])

    # Pick a fresh numbered output folder unless this is a throwaway trial run
    save_folder = './saves/{0}'.format(args.env_name + '/' + args.output_folder)
    if args.output_folder != 'maml-trial' and args.output_folder != 'trial':
        i = 0
        while os.path.exists(save_folder):
            args.output_folder = str(i + 1)
            i += 1
            save_folder = './saves/{0}'.format(
                args.env_name + '/' + args.output_folder)
    log_directory = './logs/{0}'.format(args.env_name + '/' + args.output_folder)
    os.makedirs(save_folder)
    writer = SummaryWriter(log_directory)

    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    sampler = BatchSampler(args.env_name, batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    if args.load_dir is not None:
        policy.load_state_dict(torch.load(args.load_dir))

    metalearner = MetaLearner(sampler, policy, baseline, args,
                              gamma=args.gamma, fast_lr=args.fast_lr,
                              tau=args.tau, device=args.device)

    for batch in range(args.num_batches):
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        metalearner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)

        print('total_rewards/before_update',
              total_rewards([ep.rewards for ep, _ in episodes]), batch)
        print('total_rewards/after_update',
              total_rewards([ep.rewards for _, ep in episodes]), batch)

        # Plotting figure
        # plotting(episodes, batch, save_folder, args.num_plots)

        if args.load_dir is not None:
            sys.exit(0)

        # Tensorboard
        writer.add_scalar('total_rewards/before_update',
                          total_rewards([ep.rewards for ep, _ in episodes]),
                          batch)
        writer.add_scalar('total_rewards/after_update',
                          total_rewards([ep.rewards for _, ep in episodes]),
                          batch)

        # Save policy network
        with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                  'wb') as f:
            torch.save(policy.state_dict(), f)
def main(args):
    set_random_seed(args.random)

    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', '2DNavigationBiased-v0'
    ])

    writer = SummaryWriter('./logs/{0}'.format(args.alg))
    save_folder = './saves/{0}'.format(args.alg)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    sampler = BatchSampler(args.env_name, batch_size=args.fast_batch_size,
                           num_workers=args.num_workers, seed=args.random)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    if args.alg == 'simul':
        # vanilla MAML
        metalearner = MetaLearner(sampler, policy, baseline, gamma=args.gamma,
                                  fast_lr=args.fast_lr, tau=args.tau,
                                  device=args.device)

        for batch in range(args.meta_policy_num * args.num_batches):
            # First sample tasks under the task distribution
            tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
            # Episodes come as (train episodes, test episodes after adaptation)
            episodes = metalearner.sample(tasks, first_order=args.first_order)
            metalearner.step(episodes, max_kl=args.max_kl,
                             cg_iters=args.cg_iters,
                             cg_damping=args.cg_damping,
                             ls_max_steps=args.ls_max_steps,
                             ls_backtrack_ratio=args.ls_backtrack_ratio)

            # Tensorboard
            writer.add_scalar('maml/before_update',
                              total_rewards([ep.rewards for ep, _ in episodes]),
                              batch)
            writer.add_scalar('maml/after_update',
                              total_rewards([ep.rewards for _, ep in episodes]),
                              batch)

            # Save policy network
            with open(os.path.join(save_folder,
                                   'policy-{0}.pt'.format(batch)), 'wb') as f:
                torch.save(policy.state_dict(), f)

    elif args.alg == 'greedy':
        # multi-policy MAML
        metalearner = KPolicyMetaLearner(sampler, policy, baseline,
                                         args.meta_policy_num,
                                         gamma=args.gamma, fast_lr=args.fast_lr,
                                         tau=args.tau, device=args.device)
        # Visualize the policies' behavior
        trajectories = []
        for policy_idx in range(args.meta_policy_num):
            print(policy_idx)
            metalearner.optimize_policy_index(policy_idx)
            for batch in range(args.num_batches):
                print('batch num %d' % batch)
                tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
                metalearner.evaluate_optimized_policies(tasks)
                episodes = metalearner.sample(tasks,
                                              first_order=args.first_order)
                # Loss is computed inside, then policies are updated
                metalearner.step(episodes, max_kl=args.max_kl,
                                 cg_iters=args.cg_iters,
                                 cg_damping=args.cg_damping,
                                 ls_max_steps=args.ls_max_steps,
                                 ls_backtrack_ratio=args.ls_backtrack_ratio)
                # Log per-task post-adaptation returns for this policy
                for epIdx in range(len(episodes)):
                    writer.add_scalar(
                        'kmaml/pi_' + str(policy_idx) + '_task_' + str(epIdx),
                        total_rewards([episodes[epIdx][1].rewards]), batch)
            # Use a random task (no update here anyway) to visualize the
            # meta-policies
            tasks = sampler.sample_tasks(num_tasks=1)
            trajectories.append(metalearner.sample_meta_policy(tasks[0]))
        plotTrajectories(trajectories)
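# --- Helper sketch (not part of the original file) ---
# set_random_seed(...) above is not defined in this section. A minimal sketch
# mirroring the inline seeding done in a later variant (Python, NumPy, torch):
import random

import numpy as np
import torch

def set_random_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)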
def main(args):
    save_folder = f'saves/{args.output_folder + get_date_str()}'
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    print('Initializing samplers...')
    sampler = BatchSampler(args.env_name, batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    test_sampler = BatchSampler(args.env_name, test_env=True,
                                batch_size=args.fast_batch_size,
                                num_workers=max(1, args.num_workers // 2))

    policy = NormalMLPPolicy(
        int(np.prod(sampler.envs.observation_space.shape)),
        int(np.prod(sampler.envs.action_space.shape)),
        hidden_sizes=(args.hidden_size,) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    print('Initializing meta-learners...')
    metalearner = MetaLearner(sampler, policy, baseline, gamma=args.gamma,
                              fast_lr=args.fast_lr, tau=args.tau,
                              device=args.device)  # noqa: E128
    # NOTE: we need this metalearner only to sample test tasks
    test_metalearner = MetaLearner(test_sampler, policy, baseline,
                                   gamma=args.gamma, fast_lr=args.fast_lr,
                                   tau=args.tau, device=args.device)  # noqa: E128

    print('Starting the training')
    # Initialize logging
    wandb.init()
    wandb.config.update(args)

    task_name2id = {name: i for i, name in enumerate(sampler._env._task_names)}
    task_id2name = sampler._env._task_names
    task2prob = np.ones(sampler._env.num_tasks) / sampler._env.num_tasks
    uniform = np.ones_like(task2prob) / sampler._env.num_tasks

    # outer loop (meta-training)
    for i in range(args.num_batches):
        print(f'Batch {i}')

        # sample trajectories from random tasks
        print(f'\tSampling a batch of {args.meta_batch_size} training tasks')
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size,
                                     task2prob=0.99 * task2prob + 0.01 * uniform)
        # Note: Dirty hack to overcome metaworld dirty hack
        task_names = [sampler._env._task_names[t['task']] for t in tasks]

        # inner loop (adaptation); returns a list of tuples
        # (train_episodes, valid_episodes)
        print('\tTraining')
        episodes = metalearner.sample(tasks, first_order=args.first_order)

        print('\tUpdating the meta-model')
        metalearner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)

        # Logging
        # before: before the parameter update
        # after: after parameter adaptation to the task
        r_before = total_rewards([ep.rewards for ep, _ in episodes])
        r_after = total_rewards([ep.rewards for _, ep in episodes])

        test_episode_infos = [ep._info_list for ep, _ in episodes]
        success_rate_before, task_success_rate_before = get_success_rate(
            test_episode_infos, task_names, per_task=True)
        test_episode_infos = [ep._info_list for _, ep in episodes]
        success_rate_after, task_success_rate_after = get_success_rate(
            test_episode_infos, task_names, per_task=True)

        wandb.log(
            {
                'total_rewards/before_update': r_before,
                'total_rewards/after_update': r_after,
                'success_rate/before_update': success_rate_before,
                'success_rate/after_update': success_rate_after,
                'success_rate/improvement':
                    success_rate_after - success_rate_before,
                'success_rate/before_update_macro':
                    np.mean(list(task_success_rate_before.values())),
                'success_rate/after_update_macro':
                    np.mean(list(task_success_rate_after.values())),
            },
            step=i)
        wandb.log(
            {
                f'success_rate/after_update/{task}': rate
                for task, rate in task_success_rate_after.items()
            },
            step=i)
        wandb.log(
            {
                f'success_rate/before_update/{task}': rate
                for task, rate in task_success_rate_before.items()
            },
            step=i)
        # 'imrovement' typo in the original metric keys fixed throughout
        wandb.log(
            {
                f'success_rate/improvement/{task}':
                    task_success_rate_after[task] - task_success_rate_before[task]
                for task in task_success_rate_before.keys()
            },
            step=i)
        wandb.log(
            {
                f'n_acquired_tasks/before_update/at_{x}': sum(
                    rate > x for rate in task_success_rate_before.values())
                for x in [0.001, 0.01, 0.05, 0.1, 0.5]
            },
            step=i)
        wandb.log(
            {
                f'n_acquired_tasks/after_update/at_{x}': sum(
                    rate > x for rate in task_success_rate_after.values())
                for x in [0.001, 0.01, 0.05, 0.1, 0.5]
            },
            step=i)

        if args.active_learning:
            new_task2prob = np.zeros_like(task2prob)
            if args.prob_f == 'linear':
                norm = 1e-7 + sum(task_success_rate_after.values())
                for task, rate in task_success_rate_after.items():
                    task_id = task_name2id[task]
                    new_task2prob[task_id] = 1. - rate / norm
            elif args.prob_f == 'softmax':
                # softmax(1 - rate), with the usual max-subtraction trick for
                # numerical stability:
                # http://cs231n.github.io/linear-classify/#softmax
                max_f = 1 - min(task_success_rate_after.values())
                for task, rate in task_success_rate_after.items():
                    task_id = task_name2id[task]
                    f = 1 - rate
                    new_task2prob[task_id] = np.exp(
                        (f - max_f) / args.temperature)
                new_task2prob = new_task2prob / (1e-7 + sum(new_task2prob))
            elif args.prob_f == 'softmax2':
                # 1 - softmax(rate)
                max_f = max(task_success_rate_after.values())
                for task, rate in task_success_rate_after.items():
                    task_id = task_name2id[task]
                    new_task2prob[task_id] = np.exp(
                        (rate - max_f) / args.temperature)
                new_task2prob = 1. - new_task2prob / (1e-7 + sum(new_task2prob))
            else:
                raise RuntimeError(
                    'prob-f should be either "softmax", "softmax2" or "linear"')

            alpha = args.success_rate_smoothing
            task2prob = alpha * task2prob + (1 - alpha) * new_task2prob
            task2prob /= sum(task2prob)
            assert all(task2prob > 0)  # strictly!

        wandb.log(
            {
                f'task2prob/{task_id2name[task_id]}': prob
                for task_id, prob in enumerate(task2prob)
            },
            step=i)

        # meta-test
        if i % args.eval_every == 0:
            print('Evaluating on meta-test')

            # save policy network
            _save_path = os.path.join(save_folder, 'policy-{0}.pt'.format(i))
            with open(_save_path, 'wb') as f:
                torch.save(policy.state_dict(), f)
            wandb.save(_save_path)

            # Evaluate on meta-test
            tasks = test_sampler.sample_tasks(num_tasks=2 * args.meta_batch_size)
            # Note: Dirty hack to overcome metaworld dirty hack
            task_names = [test_sampler._env._task_names[t['task']]
                          for t in tasks]
            episodes = test_metalearner.sample(tasks,
                                               first_order=args.first_order)

            r_before = total_rewards([ep.rewards for ep, _ in episodes])
            r_after = total_rewards([ep.rewards for _, ep in episodes])

            test_episode_infos = [ep._info_list for ep, _ in episodes]
            success_rate_before, task_success_rate_before = get_success_rate(
                test_episode_infos, task_names, per_task=True)
            test_episode_infos = [ep._info_list for _, ep in episodes]
            success_rate_after, task_success_rate_after = get_success_rate(
                test_episode_infos, task_names, per_task=True)

            wandb.log(
                {
                    'total_rewards_test/before_update': r_before,
                    'total_rewards_test/after_update': r_after,
                    'success_rate_test/before_update': success_rate_before,
                    'success_rate_test/after_update': success_rate_after,
                    'success_rate_test/improvement':
                        success_rate_after - success_rate_before
                },
                step=i)
            wandb.log(
                {
                    f'success_rate_test/after_update/{task}': rate
                    for task, rate in task_success_rate_after.items()
                },  # noqa: E501
                step=i)
            wandb.log(
                {
                    f'success_rate_test/before_update/{task}': rate
                    for task, rate in task_success_rate_before.items()
                },  # noqa: E501
                step=i)
            wandb.log(
                {
                    f'success_rate_test/improvement/{task}':
                        task_success_rate_after[task] -
                        task_success_rate_before[task]
                    for task in task_success_rate_before.keys()
                },
                step=i)

    print('Saving the final model')
    # save final policy
    _save_path = os.path.join(save_folder, 'policy-final.pt')
    with open(_save_path, 'wb') as f:
        torch.save(policy.state_dict(), f)
    wandb.save(_save_path)
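# --- Helper sketch (not part of the original file) ---
# get_success_rate(...) above is assumed, not shown. A sketch of a compatible
# helper, assuming each element of episode_infos is the list of per-step info
# dicts for one task's episodes and that an episode counts as successful when
# any step reports a truthy 'success' flag; the real metric may differ:
from collections import defaultdict

import numpy as np

def get_success_rate(episode_infos, task_names, per_task=False):
    per_task_hits = defaultdict(list)
    for infos, task in zip(episode_infos, task_names):
        hit = float(any(info.get('success', False) for info in infos))
        per_task_hits[task].append(hit)
    all_hits = [h for hits in per_task_hits.values() for h in hits]
    rate = float(np.mean(all_hits)) if all_hits else 0.0
    if not per_task:
        return rate
    return rate, {task: float(np.mean(hits))
                  for task, hits in per_task_hits.items()}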
def main(args):
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)
    torch.manual_seed(args.seed)

    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0'
    ])

    writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    sampler = BatchSampler(args.env_name, batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    metalearner = MetaLearner(
        sampler, policy, baseline,
        gamma=args.gamma,
        fast_lr=args.fast_lr,
        tau=args.tau,
        q_inner=args.inner_q == 'true',
        q_residuce_gradient=args.inner_q_residue_gradient == 'true',
        q_soft=args.inner_q_soft == 'true',
        q_soft_temp=args.inner_q_soft_temp,
        device=args.device,
    )

    for batch in range(args.num_batches):
        if args.device.type == 'cuda':
            torch.cuda.empty_cache()
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        episodes, adaptation_info = metalearner.sample(
            tasks, first_order=args.first_order)
        metalearner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)

        # Tensorboard
        pre_update_rewards = total_rewards([ep.rewards for ep, _ in episodes])
        post_update_rewards = total_rewards([ep.rewards for _, ep in episodes])
        writer.add_scalar('total_rewards/before_update', pre_update_rewards,
                          batch)
        writer.add_scalar('total_rewards/after_update', post_update_rewards,
                          batch)
        writer.add_scalar('total_rewards/rewards_improvement',
                          post_update_rewards - pre_update_rewards, batch)
        writer.add_scalar('adaptation/pre_update_inner_loss',
                          adaptation_info.mean_pre_update_loss, batch)
        writer.add_scalar('adaptation/post_update_inner_loss',
                          adaptation_info.mean_post_update_loss, batch)
        writer.add_scalar('adaptation/inner_loss_improvement',
                          adaptation_info.mean_loss_improvment, batch)
        writer.add_scalar('adaptation/weight_change',
                          adaptation_info.mean_weight_change, batch)

        # Save policy network
        with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                  'wb') as f:
            torch.save(policy.state_dict(), f)
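# --- Entry-point sketch (not part of the original file) ---
# None of the variants show how `args` is constructed. A minimal sketch; the
# flag names simply mirror the attributes accessed above and the defaults are
# illustrative, not the original scripts':
if __name__ == '__main__':
    import argparse

    import torch

    parser = argparse.ArgumentParser(description='MAML-RL training (sketch)')
    parser.add_argument('--env-name', type=str, default='2DNavigation-v0')
    parser.add_argument('--output-folder', type=str, default='maml')
    parser.add_argument('--seed', type=int, default=1)
    parser.add_argument('--fast-batch-size', type=int, default=20)
    parser.add_argument('--meta-batch-size', type=int, default=40)
    parser.add_argument('--num-batches', type=int, default=500)
    parser.add_argument('--num-workers', type=int, default=8)
    parser.add_argument('--hidden-size', type=int, default=100)
    parser.add_argument('--num-layers', type=int, default=2)
    parser.add_argument('--gamma', type=float, default=0.95)
    parser.add_argument('--tau', type=float, default=1.0)
    parser.add_argument('--fast-lr', type=float, default=0.5)
    parser.add_argument('--first-order', action='store_true')
    parser.add_argument('--max-kl', type=float, default=1e-2)
    parser.add_argument('--cg-iters', type=int, default=10)
    parser.add_argument('--cg-damping', type=float, default=1e-5)
    parser.add_argument('--ls-max-steps', type=int, default=15)
    parser.add_argument('--ls-backtrack-ratio', type=float, default=0.8)
    args = parser.parse_args()

    # The variants read args.device.type, so store a torch.device here
    args.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    main(args)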
# Evaluation fragment: measures post-adaptation success rate over `buckets`
# batches of test tasks. `env`, `model`, `successes`, `buckets`,
# `num_test_tasks`, and `gradient_step` are assumed to be defined earlier in
# the original script.
task_success = []
for _ in range(buckets):
    tasks = env.unwrapped.sample_tasks(num_test_tasks)
    success = 0
    # times = []
    metalearner = gradient_step(0, tasks, args)
    for task in tasks:
        s = env.reset_task(task)
        step = 0
        d = False
        while not d:
            # env.render()
            input = torch.tensor(s).float()  # note: shadows the builtin input()
            action = model.forward(
                input, model.state_dict()).rsample().detach().numpy()
            step += 1
            s, r, d, info = env.step(action)
            if r == 1:
                success += 1
            # times.append(step)
    # maml.append(times)
    task_success.append(success / num_test_tasks)
successes.append(task_success)
env.close()

# out = [successes, maml]
if not os.path.exists('./pkls'):
    os.makedirs('./pkls')
with open('./pkls/{0}.pkl'.format(args.output_folder), 'wb') as f:
    pickle.dump(successes, f)
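# --- Usage sketch (not part of the original file) ---
# Reading back the per-bucket success rates dumped above; `args.output_folder`
# is the same value used when the pickle was written:
import pickle

with open('./pkls/{0}.pkl'.format(args.output_folder), 'rb') as f:
    successes = pickle.load(f)
for run_idx, task_success in enumerate(successes):
    print('run {0}: mean success rate {1:.3f}'.format(
        run_idx, sum(task_success) / len(task_success)))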