def main(args):
    continuous_actions = (args.env_name in [
        'AntVelEnv-v1', 'AntDirEnv-v1', 'HalfCheetahVelEnv-v1',
        'HalfCheetahDirEnv-v1', '2DNavigation-v0'
    ])

    save_folder = os.path.join('tmp', args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    # Load model
    with open(args.model, 'rb') as f:
        state_dict = torch.load(f)
        policy.load_state_dict(state_dict)

    metalearner = MetaLearner(sampler, policy, baseline,
                              gamma=args.gamma, fast_lr=args.fast_lr,
                              tau=args.tau, device=args.device)

    args.meta_batch_size = 81
    # velocities = np.linspace(-1., 3., num=args.meta_batch_size)
    # tasks = [{'velocity': velocity} for velocity in velocities]
    tasks = [{'direction': direction} for direction in [-1, 1]]

    for batch in range(args.num_batches):
        episodes = metalearner.sample(tasks)
        train_returns = [ep.rewards.sum(0).cpu().numpy() for ep, _ in episodes]
        valid_returns = [ep.rewards.sum(0).cpu().numpy() for _, ep in episodes]
        with open(os.path.join(save_folder, '{0}.npz'.format(batch)), 'wb') as f:
            np.savez(f, train=train_returns, valid=valid_returns)
        print('Batch {0}'.format(batch))
def main(args):
    env_name = 'RVONavigationAll-v0'  # ['2DNavigation-v0', 'RVONavigation-v0', 'RVONavigationAll-v0']
    test_folder = './{0}'.format('test_nav')
    fast_batch_size = 40  # number of trajectories
    saved_policy_file = os.path.join(
        './TrainingResults/result3/saves/{0}'.format('maml-2DNavigation-dir'),
        'policy-180.pt')

    sampler = BatchSampler(env_name, batch_size=fast_batch_size, num_workers=3)
    policy = NormalMLPPolicy(
        int(np.prod(sampler.envs.observation_space.shape)),
        int(np.prod(sampler.envs.action_space.shape)),
        hidden_sizes=(100,) * 2)

    # Load the saved policy
    if os.path.isfile(saved_policy_file):
        policy_info = torch.load(saved_policy_file,
                                 map_location=lambda storage, loc: storage)
        policy.load_state_dict(policy_info)
        print('Loaded saved policy')
    else:
        sys.exit("The requested policy does not exist for loading")

    # Create the test folder
    if not os.path.exists(test_folder):
        os.makedirs(test_folder)

    # Generate tasks
    # goal = [[-0.8, 0.9]]
    # task = [{'goal': goal}][0]
    tasks = sampler.sample_tasks(num_tasks=1)
    task = tasks[0]

    # Start validation
    print("Starting to test... Total gradient steps =", args.grad_steps)
    start_time = time.time()

    # baseline = LinearFeatureBaseline(int(np.prod(sampler.envs.observation_space.shape)))
    baseline = LinearFeatureBaseline(int(np.prod((2,))))
    metalearner = MetaLearner(sampler, policy, baseline,
                              gamma=0.9, fast_lr=0.01, tau=0.99, device='cpu')

    # test_episodes = metalearner.sample(tasks)
    # for train, valid in test_episodes:
    #     total_reward, dist_reward, col_reward = total_rewards(train.rewards)
    #     print(total_reward)
    #     total_reward, dist_reward, col_reward = total_rewards(valid.rewards)
    #     print(total_reward)

    test_episodes = metalearner.test(task, n_grad=args.grad_steps)
    print('-------------------')
    for n_grad, ep in test_episodes:
        total_reward, dist_reward, col_reward = total_rewards(ep.rewards)
        print(total_reward)
        # with open(os.path.join(test_folder, 'test_episodes_grad' + str(n_grad) + '.pkl'), 'wb') as f:
        #     pickle.dump([ep.observations.cpu().numpy(), ep], f)

    # with open(os.path.join(test_folder, 'task.pkl'), 'wb') as f:
    #     pickle.dump(task, f)

    print('Finished test. Time elapsed = {}'.format(
        time_elapsed(time.time() - start_time)))
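# The test script above (and the training loop further down) calls a
# `time_elapsed` helper that is not defined in this file. A minimal sketch,
# assuming the helper only has to turn a duration in seconds into a readable
# H:MM:SS string for log messages; the name matches the call sites here, but
# the body is an assumption, not the project's actual implementation:
def time_elapsed(seconds):
    """Format a duration in seconds as H:MM:SS for log messages."""
    hours, rem = divmod(int(seconds), 3600)
    minutes, secs = divmod(rem, 60)
    return '{0}:{1:02d}:{2:02d}'.format(hours, minutes, secs)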
def eval(args):
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0'
    ])

    # writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    log_folder = './logs/{0}'.format(args.output_folder)
    if not os.path.exists(log_folder):
        os.makedirs(log_folder)

    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)

    if args.env_name == 'AntPos-v0':
        param_bounds = {"x": [-3, 3], "y": [-3, 3]}

    tree = TreeLSTM(args.tree_hidden_layer,
                    len(param_bounds.keys()),
                    args.cluster_0,
                    args.cluster_1,
                    device=args.device)

    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape) + args.tree_hidden_layer),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size,) * args.num_layers)

    policy.eval()
    tree.eval()

    all_tasks = []
    # torch.autograd.set_detect_anomaly(True)
    reward_list = []
    for batch in range(args.num_batches + 1):
        print("starting iteration {}".format(batch))
        try:
            policy.load_state_dict(
                torch.load(os.path.join(save_folder, 'policy-{0}.pt'.format(batch))))
            tree = torch.load(os.path.join(save_folder, 'tree-{0}.pt'.format(batch)))
            tree.eval()
        except Exception:
            # No more checkpoints: dump the rewards collected so far and stop.
            with open('./logs/{0}/reward_list_eval.pkl'.format(args.output_folder), 'wb') as pf:
                pickle.dump(reward_list, pf)
            print(reward_list)
            return
        # tree.load_state_dict(torch.load(os.path.join(save_folder,
        #     'tree-{0}.pt'.format(batch))))

        tasks = sampler.sample_tasks(args.meta_batch_size)
        all_tasks.append(tasks)
        # tasks = np.array(tasks)
        # tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        with open('./logs/{0}/task_list_eval.pkl'.format(args.output_folder), 'wb') as pf:
            pickle.dump(all_tasks, pf)

        print("evaluating batch {}...".format(batch))
        all_rewards = []
        for task in tasks:
            print(task["position"])
            episodes = sampler.sample(policy, task, tree=tree)
            # tr = [ep.rewards for ep in episodes]
            # tr = np.mean([torch.mean(torch.sum(rewards, dim=0)).item() for rewards in tr])
            all_rewards.append(total_rewards(episodes.rewards))
        reward_list.append(np.mean(all_rewards))

        with open('./logs/{0}/reward_list_eval.pkl'.format(args.output_folder), 'wb') as pf:
            pickle.dump(reward_list, pf)
        print(reward_list)
def main(args):
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', 'RVONavigation-v0',
        'RVONavigationAll-v0'
    ])
    assert continuous_actions == True

    writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_folder = './saves/{0}'.format(args.output_folder)
    log_traj_folder = './logs/{0}'.format(args.output_traj_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    if not os.path.exists(log_traj_folder):
        os.makedirs(log_traj_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    # log_reward_total_file = open('./logs/reward_total.txt', 'a')
    # log_reward_dist_file = open('./logs/reward_dist.txt', 'a')
    # log_reward_col_file = open('./logs/reward_col.txt', 'a')

    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    # print(sampler.envs.observation_space.shape)
    # print(sampler.envs.action_space.shape)

    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size,) * args.num_layers)

    # baseline = LinearFeatureBaseline(
    #     int(np.prod(sampler.envs.observation_space.shape)))
    baseline = LinearFeatureBaseline(int(np.prod((2,))))

    resume_training = True
    if resume_training:
        saved_policy_path = os.path.join(
            './TrainingResults/result2/saves/{0}'.format('maml-2DNavigation-dir'),
            'policy-180.pt')
        if os.path.isfile(saved_policy_path):
            print('Loading a saved policy')
            policy_info = torch.load(saved_policy_path)
            policy.load_state_dict(policy_info)
        else:
            sys.exit("The requested policy does not exist for loading")

    metalearner = MetaLearner(sampler, policy, baseline,
                              gamma=args.gamma, fast_lr=args.fast_lr,
                              tau=args.tau, device=args.device)

    start_time = time.time()
    for batch in range(args.num_batches):
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        metalearner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)
        # print("observations shape: ")
        # print(episodes[0][1].observations.shape)

        # Tensorboard
        total_reward_be, dist_reward_be, col_reward_be = total_rewards(
            [ep.rewards for ep, _ in episodes])
        total_reward_af, dist_reward_af, col_reward_af = total_rewards(
            [ep.rewards for _, ep in episodes])

        # Open and close the log files on every batch so the appended lines
        # are flushed to disk immediately.
        log_reward_total_file = open('./logs/reward_total.txt', 'a')
        log_reward_dist_file = open('./logs/reward_dist.txt', 'a')
        log_reward_col_file = open('./logs/reward_col.txt', 'a')
        log_reward_total_file.write(
            str(batch) + ',' + str(total_reward_be) + ',' + str(total_reward_af) + '\n')
        log_reward_dist_file.write(
            str(batch) + ',' + str(dist_reward_be) + ',' + str(dist_reward_af) + '\n')
        log_reward_col_file.write(
            str(batch) + ',' + str(col_reward_be) + ',' + str(col_reward_af) + '\n')
        log_reward_total_file.close()
        log_reward_dist_file.close()
        log_reward_col_file.close()

        writer.add_scalar('total_rewards/before_update', total_reward_be, batch)
        writer.add_scalar('total_rewards/after_update', total_reward_af, batch)
        writer.add_scalar('distance_reward/before_update', dist_reward_be, batch)
        writer.add_scalar('distance_reward/after_update', dist_reward_af, batch)
        writer.add_scalar('collision_rewards/before_update', col_reward_be, batch)
        writer.add_scalar('collision_rewards/after_update', col_reward_af, batch)

        if batch % args.save_every == 0:
            # Saving the model only periodically saves time and disk space.
            # Save policy network
            print('Saving model {}'.format(batch))
            with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)), 'wb') as f:
                torch.save(policy.state_dict(), f)

        if batch % 30 == 0:
            with open(os.path.join(log_traj_folder,
                                   'train_episodes_observ_' + str(batch) + '.pkl'), 'wb') as f:
                pickle.dump([ep.observations.cpu().numpy() for ep, _ in episodes], f)
            with open(os.path.join(log_traj_folder,
                                   'valid_episodes_observ_' + str(batch) + '.pkl'), 'wb') as f:
                pickle.dump([ep.observations.cpu().numpy() for _, ep in episodes], f)
            # with open(os.path.join(log_traj_folder, 'train_episodes_ped_state_' + str(batch) + '.pkl'), 'wb') as f:
            #     pickle.dump([ep.hid_observations.cpu().numpy() for ep, _ in episodes], f)
            # with open(os.path.join(log_traj_folder, 'valid_episodes_ped_state_' + str(batch) + '.pkl'), 'wb') as f:
            #     pickle.dump([ep.hid_observations.cpu().numpy() for _, ep in episodes], f)

            # Save tasks. A sample task list of 2:
            # [{'goal': array([0.0209588 , 0.15981938])}, {'goal': array([0.45034602, 0.17282322])}]
            with open(os.path.join(log_traj_folder, 'tasks_' + str(batch) + '.pkl'), 'wb') as f:
                pickle.dump(tasks, f)
        else:
            # Overwritten on each batch, so only the latest rollouts are kept.
            with open(os.path.join(log_traj_folder, 'latest_train_episodes_observ.pkl'), 'wb') as f:
                pickle.dump([ep.observations.cpu().numpy() for ep, _ in episodes], f)
            with open(os.path.join(log_traj_folder, 'latest_valid_episodes_observ.pkl'), 'wb') as f:
                pickle.dump([ep.observations.cpu().numpy() for _, ep in episodes], f)
            # with open(os.path.join(log_traj_folder, 'latest_train_episodes_ped_state.pkl'), 'wb') as f:
            #     pickle.dump([ep.hid_observations.cpu().numpy() for ep, _ in episodes], f)
            # with open(os.path.join(log_traj_folder, 'latest_valid_episodes_ped_state.pkl'), 'wb') as f:
            #     pickle.dump([ep.hid_observations.cpu().numpy() for _, ep in episodes], f)
            with open(os.path.join(log_traj_folder, 'latest_tasks.pkl'), 'wb') as f:
                pickle.dump(tasks, f)

        print('finished epoch {}; time elapsed: {}'.format(
            batch, time_elapsed(time.time() - start_time)))
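# The training loop above appends one "batch,before_update,after_update" line
# per iteration to ./logs/reward_total.txt (and similarly for the distance and
# collision logs). A small sketch for reading such a file back for plotting;
# the function name and default path are illustrative assumptions:
def load_reward_log(path='./logs/reward_total.txt'):
    """Return (batches, before, after) lists parsed from a reward log file."""
    batches, before, after = [], [], []
    with open(path) as f:
        for line in f:
            b, r_be, r_af = line.strip().split(',')
            batches.append(int(b))
            before.append(float(r_be))
            after.append(float(r_af))
    return batches, before, after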
sampler = BatchSampler(args.env_name,
                       batch_size=args.fast_batch_size,
                       num_workers=args.num_workers)
if continuous_actions:
    the_model = NormalMLPPolicy(
        int(np.prod(sampler.envs.observation_space.shape)),
        int(np.prod(sampler.envs.action_space.shape)),
        hidden_sizes=(args.hidden_size,) * args.num_layers)
else:
    the_model = CategoricalMLPPolicy(
        int(np.prod(sampler.envs.observation_space.shape)),
        sampler.envs.action_space.n,
        hidden_sizes=(args.hidden_size,) * args.num_layers)

# Load the saved model
save_folder = './saves/{0}'.format(args.output_folder)
the_model.load_state_dict(
    torch.load(os.path.join(save_folder, 'policy-{0}.pt'.format(batch))))

baseline = LinearFeatureBaseline(
    int(np.prod(sampler.envs.observation_space.shape)))

metalearner = MetaLearner(sampler, the_model, baseline,
                          gamma=args.gamma, fast_lr=args.fast_lr,
                          tau=args.tau, device=args.device)

env = gym.make(args.env_name)  # new task!
def train_meta_learning_model(args):
    # import matplotlib.pyplot as plt
    # import matplotlib.animation as animation
    # from matplotlib import style
    # style.use('fivethirtyeight')
    # fig = plt.figure()
    # ax1 = fig.add_subplot(1, 1, 1)
    # xs = []
    # ys = []
    # def animate(i):
    #     ax1.clear()
    #     ax1.plot(xs, ys)

    rewards_before_ml = []
    rewards_after_ml = []

    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', 'MountainCarContinuousVT-v0'
    ])

    # writer = SummaryWriter('./logs/{0}'.format(args.output_folder + '_metalearned'))
    save_folder = './saves/{0}'.format(args.output_folder + '_metalearned')
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    torch.manual_seed(args.random_seed)

    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size,) * args.num_layers)

    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    # Load a previously saved meta-learned model, if resuming
    cont_from_batch = 0
    if args.start_from_batch != -1:
        metalearned_model = os.path.join(
            save_folder, 'policy-{0}.pt'.format(args.start_from_batch - 1))
        if os.path.exists(metalearned_model):
            policy.load_state_dict(torch.load(metalearned_model))
            cont_from_batch = args.start_from_batch

    metalearner = MetaLearner(sampler, policy, baseline,
                              gamma=args.gamma, fast_lr=args.fast_lr,
                              tau=args.tau, device=args.device)

    for batch in range(cont_from_batch, args.num_batches):
        print('Currently processing Batch: {}'.format(batch + 1))

        task_sampling_time = time.time()
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size,
                                     sampling_type=args.sampling_type,
                                     points_per_dim=args.points_per_dim)
        task_sampling_time = time.time() - task_sampling_time

        episode_generating_time = time.time()
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        episode_generating_time = time.time() - episode_generating_time

        learning_step_time = time.time()
        metalearner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)
        learning_step_time = time.time() - learning_step_time

        print('Task Sampling Time: {}'.format(task_sampling_time))
        print('Episode Generating Time: {}'.format(episode_generating_time))
        print('Learning Step Time: {}'.format(learning_step_time))

        reward_before_ml = total_rewards([ep.rewards for ep, _ in episodes], args.gamma)
        reward_after_ml = total_rewards([ep.rewards for _, ep in episodes], args.gamma)
        print('Before Update: {} After Update: {}'.format(
            reward_before_ml, reward_after_ml))
        # experiment.log_metric("Avg Reward Before Update (MetaLearned)", reward_before_ml)
        experiment.log_metric("Avg Reward", reward_after_ml, batch + 1)
        rewards_before_ml.append(reward_before_ml)
        rewards_after_ml.append(reward_after_ml)

        # xs.append(batch + 1)
        # ys.append(total_rewards([ep.rewards for _, ep in episodes], args.gamma))
        # ani = animation.FuncAnimation(fig, animate, interval=1000)
        # plt.savefig('navg_baseline_monitor')

        # Save policy network
        with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)), 'wb') as f:
            torch.save(metalearner.policy.state_dict(), f)

    # tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
    # episodes = metalearner.sample(tasks, first_order=args.first_order)
    # print("Avg Reward After Update (MetaLearned)", total_rewards([ep.rewards for _, ep in episodes], args.gamma))

    # Final test on freshly sampled tasks, without any gradient update
    testing_sampler = BatchSampler(args.env_name,
                                   batch_size=args.testing_fbs,
                                   num_workers=args.num_workers)
    testing_metalearner = MetaLearner(testing_sampler, metalearner.policy, baseline,
                                      gamma=args.gamma, fast_lr=args.fast_lr,
                                      tau=args.tau, device=args.device)
    test_tasks = testing_sampler.sample_tasks(num_tasks=args.testing_mbs,
                                              sampling_type='rand',
                                              points_per_dim=-1)
    test_episodes = testing_metalearner.sample(test_tasks,
                                               first_order=args.first_order,
                                               no_update=True)
    test_reward = total_rewards([ep.rewards for ep in test_episodes], args.gamma)
    print('-------------------------------------------------')
    print('Test Time reward is: ' + str(test_reward))
    print('-------------------------------------------------')

    pickle_reward_data_file = os.path.join(save_folder, 'reward_data.pkl')
    with open(pickle_reward_data_file, 'wb') as f:
        pickle.dump(rewards_before_ml, f)
        pickle.dump(rewards_after_ml, f)

    pickle_final_reward_file = os.path.join(save_folder, 'final_reward.pkl')
    with open(pickle_final_reward_file, 'wb') as f:
        pickle.dump(test_reward, f)

    return
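# reward_data.pkl above is written with two consecutive pickle.dump calls, so
# it has to be read back with two pickle.load calls in the same order. A
# minimal sketch (the helper name is an assumption; paths and variable names
# mirror the code above):
def load_reward_data(save_folder):
    """Load the before/after-update reward curves dumped by the training loop."""
    with open(os.path.join(save_folder, 'reward_data.pkl'), 'rb') as f:
        rewards_before_ml = pickle.load(f)
        rewards_after_ml = pickle.load(f)
    return rewards_before_ml, rewards_after_ml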
def k_shot_experiments(args):
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', 'MountainCarContinuousVT-v0'
    ])

    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        policy_pretrained = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size,) * args.num_layers)
        policy_metalearned = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size,) * args.num_layers)
        policy_random = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    else:
        policy_pretrained = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size,) * args.num_layers)
        policy_metalearned = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size,) * args.num_layers)
        policy_random = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size,) * args.num_layers)

    # save_folder_pretrained = './saves/{0}'.format(args.output_folder + '_pretrained')
    # pretrained_model = os.path.join(save_folder_pretrained, 'policy-{0}.pt'.format(args.num_batches - 1))
    # policy_pretrained.load_state_dict(torch.load(pretrained_model))

    save_folder_metalearned = './saves/{0}'.format(args.output_folder + '_metalearned')
    metalearned_model = os.path.join(
        save_folder_metalearned, 'policy-{0}.pt'.format(args.num_batches - 1))
    policy_metalearned.load_state_dict(torch.load(metalearned_model))

    # metalearned_tester = k_shot_tester(args.K_shot_batch_num, policy_metalearned, args.K_shot_batch_size, args.K_shot_num_tasks, 'MetaLearned', args)
    # avg_discounted_returns_metalearned = metalearned_tester.run_k_shot_exp()
    # print('Metalearned KSHOT result: ', avg_discounted_returns_metalearned)
    # print('Mean: ', torch.mean(avg_discounted_returns_metalearned, 0))

    results_folder = './saves/{0}'.format(args.output_folder + '_results')
    if not os.path.exists(results_folder):
        os.makedirs(results_folder)
    kshot_fig_path1 = os.path.join(results_folder, 'kshot_testing')
    # kshot_fig_path2 = os.path.join(results_folder, 'ml_pre_diff')
    result_data_path = os.path.join(results_folder, 'data_')

    metalearned_tester = k_shot_tester(args.K_shot_batch_num, policy_metalearned,
                                       args.K_shot_batch_size, args.K_shot_num_tasks,
                                       'MetaLearned', args)
    avg_discounted_returns_metalearned = metalearned_tester.run_k_shot_exp()

    # pretrained_tester = k_shot_tester(args.K_shot_batch_num, policy_pretrained, args.K_shot_batch_size, args.K_shot_num_tasks, 'Pretrained', args)
    # avg_discounted_returns_pretrained = pretrained_tester.run_k_shot_exp()
    # random_tester = k_shot_tester(args.K_shot_batch_num, policy_random, args.K_shot_batch_size, args.K_shot_num_tasks, 'Random', args)
    # avg_discounted_returns_random = random_tester.run_k_shot_exp()

    plt.figure('K Shot: Testing Curves')
    # plt.plot([i for i in range(args.K_shot_batch_num + 1)], avg_discounted_returns_pretrained, color=np.array([0., 0., 1.]), label='Pre-Trained')
    # plt.plot([i for i in range(args.K_shot_batch_num + 1)], avg_discounted_returns_metalearned, color=np.array([0., 1., 0.]),
    #          label='Meta-Learned')
    # plt.plot([i for i in range(args.K_shot_batch_num + 1)], avg_discounted_returns_random, color=np.array([0., 0., 0.]), label='Random')
    # plt.errorbar([i for i in range(args.K_shot_batch_num + 1)], torch.mean(avg_discounted_returns_pretrained, 0).tolist(), torch.std(avg_discounted_returns_pretrained, 0).tolist(), color=np.array([0., 0., 1.]), label='Pre-Trained', capsize=5, capthick=2)
    plt.errorbar([i for i in range(args.K_shot_batch_num + 1)],
                 torch.mean(avg_discounted_returns_metalearned, 0).tolist(),
                 torch.std(avg_discounted_returns_metalearned, 0).tolist(),
                 color=np.array([0., 1., 0.]),
                 label='Meta-Learned', capsize=5, capthick=2)
    # plt.errorbar([i for i in range(args.K_shot_batch_num + 1)], torch.mean(avg_discounted_returns_random, 0).tolist(), torch.std(avg_discounted_returns_random, 0).tolist(), color=np.array([0., 0., 0.]), label='Random', capsize=5, capthick=2)
    plt.xlabel('Gradient Descent Iteration Number')
    plt.ylabel('Average Discounted Return')
    plt.title('K Shot: Testing Curves')
    plt.legend(loc='upper left')
    plt.savefig(kshot_fig_path1)
    # plt.show()

    # plt.figure('K Shot: Difference between Metalearned and Pretrained')
    # plt.errorbar([i for i in range(args.K_shot_batch_num + 1)], torch.mean(avg_discounted_returns_metalearned - avg_discounted_returns_pretrained, 0).tolist(), torch.std(avg_discounted_returns_metalearned - avg_discounted_returns_pretrained, 0).tolist(), color=np.array([0., 0., 0.]), capsize=5, capthick=2)
    # plt.xlabel('Gradient Descent Iteration Number')
    # plt.ylabel('Average Discounted Return Difference')
    # plt.title('K Shot: Difference between Metalearned and Pretrained')
    # plt.savefig(kshot_fig_path2)
    # plt.show()

    # Save torch tensor results to combine with other experiments
    # torch.save(avg_discounted_returns_pretrained, result_data_path + 'pretrained')
    torch.save(avg_discounted_returns_metalearned, result_data_path + 'metalearned')

    return
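# The k-shot experiment above saves the averaged discounted returns with
# torch.save so they can be combined with other runs. A minimal sketch of
# reading them back; the helper name is an assumption, and the file path
# mirrors result_data_path + 'metalearned' above:
def load_kshot_returns(results_folder):
    """Load the saved k-shot return tensor and return its per-step mean/std."""
    returns = torch.load(os.path.join(results_folder, 'data_metalearned'))
    return torch.mean(returns, 0), torch.std(returns, 0)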
def train_pretrained_model(args):
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', 'MountainCarContinuousVT-v0'
    ])

    # writer = SummaryWriter('./logs/{0}'.format(args.output_folder + '_pretrained'))
    save_folder = './saves/{0}'.format(args.output_folder + '_pretrained')
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    # batch_size=2*args.fast_batch_size to match the amount of data used in meta-learning
    sampler = BatchSampler(args.env_name,
                           batch_size=2 * args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size,) * args.num_layers)

    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    # Load a pretrained model, if resuming
    cont_from_batch = 0
    if args.start_from_batch != -1:
        pretrained_model = os.path.join(
            save_folder, 'policy-{0}.pt'.format(args.start_from_batch - 1))
        if os.path.exists(pretrained_model):
            policy.load_state_dict(torch.load(pretrained_model))
            cont_from_batch = args.start_from_batch

    metalearner = MetaLearner(sampler, policy, baseline,
                              gamma=args.gamma, fast_lr=args.fast_lr,
                              tau=args.tau, device=args.device)

    for batch in range(cont_from_batch, args.num_batches):
        print('Currently processing Batch: {}'.format(batch + 1))

        task_sampling_time = time.time()
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        task_sampling_time = time.time() - task_sampling_time

        episode_generating_time = time.time()
        episodes = metalearner.sample_for_pretraining(tasks, first_order=args.first_order)
        episode_generating_time = time.time() - episode_generating_time

        learning_step_time = time.time()
        params = metalearner.adapt(episodes, first_order=args.first_order)
        metalearner.policy.load_state_dict(params, strict=True)
        learning_step_time = time.time() - learning_step_time

        print('Task Sampling Time: {}'.format(task_sampling_time))
        print('Episode Generating Time: {}'.format(episode_generating_time))
        print('Learning Step Time: {}'.format(learning_step_time))

        # Tensorboard
        # writer.add_scalar('total_rewards/before_update',
        #     total_rewards([ep.rewards for ep, _ in episodes]), batch)
        # writer.add_scalar('total_rewards/after_update',
        #     total_rewards([ep.rewards for _, ep in episodes]), batch)
        # experiment.log_metric("Avg Disc Reward (Pretrained)", total_rewards([episodes.rewards], args.gamma), batch + 1)

        # Save policy network
        with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)), 'wb') as f:
            torch.save(metalearner.policy.state_dict(), f)

    return
def main(args):
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', '2DPointEnvCorner-v0'
    ])

    save_folder = './saves/{0}'.format(args.env_name + '/' + args.output_folder)
    if args.output_folder != 'maml-trial' and args.output_folder != 'trial':
        # Pick the next unused folder name instead of overwriting an existing run
        i = 0
        while os.path.exists(save_folder):
            args.output_folder = str(i + 1)
            i += 1
            save_folder = './saves/{0}'.format(args.env_name + '/' + args.output_folder)
    log_directory = './logs/{0}'.format(args.env_name + '/' + args.output_folder)
    os.makedirs(save_folder)
    writer = SummaryWriter('./logs/{0}'.format(args.env_name + '/' + args.output_folder))
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    if args.load_dir is not None:
        policy.load_state_dict(torch.load(args.load_dir))

    metalearner = MetaLearner(sampler, policy, baseline, args,
                              gamma=args.gamma, fast_lr=args.fast_lr,
                              tau=args.tau, device=args.device)

    for batch in range(args.num_batches):
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        metalearner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)

        print('total_rewards/before_update',
              total_rewards([ep.rewards for ep, _ in episodes]), batch)
        print('total_rewards/after_update',
              total_rewards([ep.rewards for _, ep in episodes]), batch)

        # Plotting figure
        # plotting(episodes, batch, save_folder, args.num_plots)

        if args.load_dir is not None:
            sys.exit(0)

        # Tensorboard
        writer.add_scalar('total_rewards/before_update',
                          total_rewards([ep.rewards for ep, _ in episodes]), batch)
        writer.add_scalar('total_rewards/after_update',
                          total_rewards([ep.rewards for _, ep in episodes]), batch)

        # Save policy network
        with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)), 'wb') as f:
            torch.save(policy.state_dict(), f)
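# total_rewards is used above as a single scalar summary of a batch of
# episodes (unlike the three-component variant in the navigation scripts
# earlier in this file). A minimal sketch consistent with this call site,
# assuming each element of episodes_rewards is a (horizon, batch) tensor of
# per-step rewards; this is an assumption, not necessarily the project's
# actual helper:
def total_rewards(episodes_rewards):
    """Average, over episode batches, of the per-episode summed rewards."""
    return torch.mean(torch.stack([
        torch.mean(torch.sum(rewards, dim=0)) for rewards in episodes_rewards
    ])).item()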
indexes = [399]
num_test_tasks = 100
buckets = 1
successes = []
for index in indexes:
    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    model = NormalMLPPolicy(
        int(np.prod(sampler.envs.observation_space.shape)),
        int(np.prod(sampler.envs.action_space.shape)),
        hidden_sizes=(args.hidden_size,) * args.num_layers)
    checkpoint = torch.load(
        '../final_models/meta/{0}/policy-{1}.pt'.format(args.to_pickle, index))
    model.load_state_dict(checkpoint)

    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    metalearner = MetaLearner(sampler, model, baseline,
                              gamma=args.gamma, fast_lr=args.fast_lr,
                              tau=args.tau, device=args.device)

    task_success = []
    for _ in range(buckets):
        tasks = env.unwrapped.sample_tasks(num_test_tasks)
        success = 0