def main(args):
    args.output_folder = args.env_name  # TODO
    continuous_actions = (args.env_name in ['AntVel-v1', 'AntDir-v1', 'AntPos-v0',
        'HalfCheetahVel-v1', 'HalfCheetahDir-v1', '2DNavigation-v0'])

    # writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)
    print(config)

    sampler = BatchSampler(args.env_name, batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),  # input shape
            int(np.prod(sampler.envs.action_space.shape)),  # output shape
            hidden_sizes=(args.hidden_size,) * args.num_layers)  # e.g. (100, 100)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size,) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    metalearner = MetaLearner(sampler, policy, baseline, gamma=args.gamma,
                              fast_lr=args.fast_lr, tau=args.tau, device=args.device)

    for batch in range(args.num_batches):  # number of epochs
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        metalearner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping, ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)

        # Tensorboard
        # writer.add_scalar('total_rewards/before_update',
        #     total_rewards([ep.rewards for ep, _ in episodes]), batch)
        # writer.add_scalar('total_rewards/after_update',
        #     total_rewards([ep.rewards for _, ep in episodes]), batch)

        # # Save policy network
        # with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)), 'wb') as f:
        #     torch.save(policy.state_dict(), f)

        print(batch,
              total_rewards([ep.rewards for ep, _ in episodes]),
              total_rewards([ep.rewards for _, ep in episodes]))
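# All of the script variants below report returns through a total_rewards helper
# that is defined elsewhere in these repositories. A minimal sketch of such a
# helper is given here for reference, assuming each element of episodes_rewards
# is a torch tensor of per-step rewards with shape (horizon, num_episodes); the
# exact implementation used by any given fork may differ.
import torch


def total_rewards(episodes_rewards, aggregation=torch.mean):
    # Sum rewards over time (dim=0), aggregate over the episodes of each task,
    # then average across the meta-batch of tasks.
    rewards = torch.mean(torch.stack([aggregation(torch.sum(rewards, dim=0))
                                      for rewards in episodes_rewards], dim=0))
    return rewards.item()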
def main(args): continuous_actions = True writer = SummaryWriter('./logs/{0}'.format(args.output_folder)) save_folder = './saves/{0}'.format(args.output_folder) if not os.path.exists(save_folder): os.makedirs(save_folder) with open(os.path.join(save_folder, 'config.json'), 'w') as f: config = {k: v for (k, v) in vars(args).items() if k != 'device'} config.update(device=args.device.type) json.dump(config, f, indent=2) sampler = BatchSampler(args.env_name, batch_size=args.fast_batch_size, num_workers=args.num_workers) if continuous_actions: policy = NormalMLPPolicy( int(np.prod(sampler.envs.observation_space.shape)), int(np.prod(sampler.envs.action_space.shape)), hidden_sizes=(args.hidden_size, ) * args.num_layers) else: policy = CategoricalMLPPolicy( int(np.prod(sampler.envs.observation_space.shape)), sampler.envs.action_space.n, hidden_sizes=(args.hidden_size, ) * args.num_layers) baseline = LinearFeatureBaseline( int(np.prod(sampler.envs.observation_space.shape))) metalearner = MetaLearner(sampler, policy, baseline, gamma=args.gamma, fast_lr=args.fast_lr, tau=args.tau, device=args.device) for batch in range(args.num_batches): print("========== BATCH NUMBER {0} ==========".format(batch)) tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size) episodes = metalearner.sample(tasks, first_order=args.first_order) metalearner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters, cg_damping=args.cg_damping, ls_max_steps=args.ls_max_steps, ls_backtrack_ratio=args.ls_backtrack_ratio) # Tensorboard writer.add_scalar('total_rewards/before_update', total_rewards([ep.rewards for ep, _ in episodes]), batch) writer.add_scalar('total_rewards/after_update', total_rewards([ep.rewards for _, ep in episodes]), batch) # Save policy network with open( os.path.join(save_folder, 'policy-{0}.pt'.format(batch + 256)), 'wb') as f: torch.save(policy.state_dict(), f)
def main(args):
    continuous_actions = (args.env_name in [
        'AntVelEnv-v1', 'AntDirEnv-v1', 'HalfCheetahVelEnv-v1',
        'HalfCheetahDirEnv-v1', '2DNavigation-v0'
    ])

    save_folder = os.path.join('tmp', args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)

    sampler = BatchSampler(args.env_name, batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    # Load model
    with open(args.model, 'rb') as f:
        state_dict = torch.load(f)
        policy.load_state_dict(state_dict)

    metalearner = MetaLearner(sampler, policy, baseline, gamma=args.gamma,
                              fast_lr=args.fast_lr, tau=args.tau, device=args.device)

    args.meta_batch_size = 81
    # velocities = np.linspace(-1., 3., num=args.meta_batch_size)
    # tasks = [{'velocity': velocity} for velocity in velocities]
    tasks = [{'direction': direction} for direction in [-1, 1]]

    for batch in range(args.num_batches):
        episodes = metalearner.sample(tasks)
        train_returns = [ep.rewards.sum(0).cpu().numpy() for ep, _ in episodes]
        valid_returns = [ep.rewards.sum(0).cpu().numpy() for _, ep in episodes]
        with open(os.path.join(save_folder, '{0}.npz'.format(batch)), 'wb') as f:
            np.savez(f, train=train_returns, valid=valid_returns)
        print('Batch {0}'.format(batch))
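# The evaluation script above dumps per-batch returns to '<save_folder>/<batch>.npz'
# with 'train' and 'valid' arrays. A small sketch for summarising those files is
# below; the folder layout and array names come from the script, everything else
# is illustrative.
import os

import numpy as np


def summarize_returns(save_folder, num_batches):
    # Print mean pre- and post-adaptation returns for each saved batch.
    for batch in range(num_batches):
        path = os.path.join(save_folder, '{0}.npz'.format(batch))
        if not os.path.exists(path):
            continue
        with np.load(path) as data:
            train_returns = np.asarray(data['train'])
            valid_returns = np.asarray(data['valid'])
        print(batch, train_returns.mean(), valid_returns.mean())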
def main(args):
    np.random.seed(RANDOM_SEED)

    writer = SummaryWriter('./logs/{0}'.format(args.output_folder))
    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    sampler = MrcBatchSampler(args.env_name, batch_size=args.fast_batch_size,
                              train_folder=TRAIN_TRACES)
    policy = ActorNet(input_size=[S_INFO, S_LEN], output_size=A_DIM)
    baseline = CriticNet(input_size=[S_INFO, S_LEN], output_size=A_DIM)
    # baseline.load_state_dict(torch.load(os.path.join(save_folder, 'baseline-2000.pt')))
    # policy.load_state_dict(torch.load(os.path.join(save_folder, 'policy-2000.pt')))

    metalearner = MetaLearner(sampler, policy, baseline, gamma=args.gamma,
                              fast_lr=args.fast_lr, tau=args.tau, device=args.device)

    for batch in range(args.num_batches):
        print()
        print("===================================================================")
        print("=====================Now epoch: ", batch, "========================")
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        metalearner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping, ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)

        print("total_rewards/before_update ",
              total_rewards([ep.rewards for ep, _ in episodes]))
        print('total_rewards/after_update',
              total_rewards([ep.rewards for _, ep in episodes]))

        # # Tensorboard
        # writer.add_scalar('total_rewards/before_update',
        #     total_rewards([ep.rewards for ep, _ in episodes]), batch)
        # writer.add_scalar('total_rewards/after_update',
        #     total_rewards([ep.rewards for _, ep in episodes]), batch)

        if not batch % 5:
            metaTest(policy, baseline, batch, args.meta_batch_size,
                     args.fast_batch_size)

        # Save policy network
        with open(os.path.join(save_folder, 'meta-policy-{0}.pt'.format(batch)),
                  'wb') as f:
            torch.save(policy.state_dict(), f)
        with open(os.path.join(save_folder, 'meta-baseline-{0}.pt'.format(batch)),
                  'wb') as f:
            torch.save(baseline.state_dict(), f)
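# The commented-out load_state_dict calls above hint at how training is resumed
# from saved state dicts. A generic sketch is below; construction of the
# ActorNet/CriticNet instances is left to the caller, and the checkpoint names
# follow the save calls in the loop above.
import os

import torch


def load_meta_checkpoint(policy, baseline, save_folder, batch, device='cpu'):
    # Restore both networks in-place from the files written at a given batch.
    policy_path = os.path.join(save_folder, 'meta-policy-{0}.pt'.format(batch))
    baseline_path = os.path.join(save_folder, 'meta-baseline-{0}.pt'.format(batch))
    policy.load_state_dict(torch.load(policy_path, map_location=device))
    baseline.load_state_dict(torch.load(baseline_path, map_location=device))
    return policy, baseline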
def main(args): continuous_actions = (args.env_name in [ 'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1', 'HalfCheetahDir-v1', '2DNavigation-v0' ]) writer = SummaryWriter('./logs/{0}'.format(args.output_folder)) save_folder = './saves/{0}'.format(args.output_folder) if not os.path.exists(save_folder): os.makedirs(save_folder) with open(os.path.join(save_folder, 'config.json'), 'w') as f: config = {k: v for (k, v) in vars(args).items() if k != 'device'} config.update(device=args.device.type) json.dump(config, f, indent=2) sampler = BatchSampler(args.env_name, batch_size=args.fast_batch_size, num_workers=args.num_workers) if args.env_name == 'AntVel-v1': param_bounds = {"goal": [0, 3]} if args.env_name == 'AntPos-v0': param_bounds = {"x": [-3, 3], "y": [-3, 3]} teacher = TeacherController(args.teacher, args.nb_test_episodes, param_bounds, seed=args.seed, teacher_params={}) tree = TreeLSTM(args.tree_hidden_layer, len(param_bounds.keys()), args.cluster_0, args.cluster_1, device=args.device) if continuous_actions: policy = NormalMLPPolicy(int( np.prod(sampler.envs.observation_space.shape) + args.tree_hidden_layer), int(np.prod(sampler.envs.action_space.shape)), hidden_sizes=(args.hidden_size, ) * args.num_layers, tree=tree) else: policy = CategoricalMLPPolicy(int( np.prod(sampler.envs.observation_space.shape) + args.tree_hidden_layer), sampler.envs.action_space.n, hidden_sizes=(args.hidden_size, ) * args.num_layers, tree=tree) baseline = LinearFeatureBaseline( int(np.prod(sampler.envs.observation_space.shape)) + args.tree_hidden_layer) metalearner = MetaLearner(sampler, policy, baseline, tree=tree, gamma=args.gamma, fast_lr=args.fast_lr, tau=args.tau, device=args.device) all_tasks = [] for batch in range(args.num_batches): print("starting iteration {}".format(batch)) tasks = [] for _ in range(args.meta_batch_size): if args.env_name == 'AntPos-v0': tasks.append( {"position": teacher.task_generator.sample_task()}) if args.env_name == 'AntVel-v1': tasks.append( {"velocity": teacher.task_generator.sample_task()[0]}) all_tasks.append(tasks) # tasks = np.array(tasks) # tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size) with open('./logs/{0}/task_list.pkl'.format(args.output_folder), 'wb') as pf: pickle.dump(all_tasks, pf) episodes = metalearner.sample(tasks, first_order=args.first_order) metalearner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters, cg_damping=args.cg_damping, ls_max_steps=args.ls_max_steps, ls_backtrack_ratio=args.ls_backtrack_ratio) # Tensorboard writer.add_scalar('total_rewards/before_update', total_rewards([ep.rewards for ep, _ in episodes]), batch) writer.add_scalar('total_rewards/after_update', total_rewards([ep.rewards for _, ep in episodes]), batch) tr = [ep.rewards for _, ep in episodes] tr = [torch.mean(torch.sum(rewards, dim=0)).item() for rewards in tr] print("rewards:", tr) for t in range(args.meta_batch_size): if args.env_name == 'AntPos-v0': teacher.task_generator.update(tasks[t]["position"], tr[t]) if args.env_name == 'AntVel-v1': teacher.task_generator.update(np.array([tasks[t]["velocity"]]), tr[t]) # Save policy network with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)), 'wb') as f: torch.save(policy.state_dict(), f) # Save tree torch.save(tree, os.path.join(save_folder, 'tree-{0}.pt'.format(batch)))
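# The curriculum variant above only relies on two methods of the teacher's task
# generator: sample_task() and update(task, reward). A minimal random stand-in
# with that interface is sketched here (purely illustrative, not the actual
# TeacherController); the bounds mirror the AntPos-v0 param_bounds used above.
import numpy as np


class RandomTaskGenerator(object):
    def __init__(self, low=(-3.0, -3.0), high=(3.0, 3.0), seed=None):
        self.low = np.asarray(low)
        self.high = np.asarray(high)
        self.rng = np.random.RandomState(seed)

    def sample_task(self):
        # Uniformly sample a task parameter vector within the bounds.
        return self.rng.uniform(self.low, self.high)

    def update(self, task, reward):
        # A real teacher (e.g. ALP-GMM) would refit its sampling distribution to
        # (task, reward) pairs; the random stand-in simply ignores them.
        pass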
        int(np.prod(sampler.envs.observation_space.shape)),
        sampler.envs.action_space.n,
        hidden_sizes=(args.hidden_size, ) * args.num_layers)

    # loading the model
    save_folder = './saves/{0}'.format(args.output_folder)
    the_model.load_state_dict(
        torch.load(os.path.join(save_folder, 'policy-{0}.pt'.format(batch))))

    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    metalearner = MetaLearner(sampler, the_model, baseline, gamma=args.gamma,
                              fast_lr=args.fast_lr, tau=args.tau, device=args.device)

    env = gym.make(args.env_name)  # new task!
    episodes = []

    # randomly sample task
    test_task = sampler.sample_tasks(num_tasks=1)

    # set specific task.
    # test_task = []
    # test_task.append({'velocity': 1.9})
def main(args): continuous_actions = (args.env_name in [ 'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1', 'HalfCheetahDir-v1', '2DNavigation-v0', 'RVONavigation-v0', 'RVONavigationAll-v0' ]) assert continuous_actions == True writer = SummaryWriter('./logs/{0}'.format(args.output_folder)) save_folder = './saves/{0}'.format(args.output_folder) log_traj_folder = './logs/{0}'.format(args.output_traj_folder) if not os.path.exists(save_folder): os.makedirs(save_folder) if not os.path.exists(log_traj_folder): os.makedirs(log_traj_folder) with open(os.path.join(save_folder, 'config.json'), 'w') as f: config = {k: v for (k, v) in vars(args).items() if k != 'device'} config.update(device=args.device.type) json.dump(config, f, indent=2) # log_reward_total_file = open('./logs/reward_total.txt', 'a') # log_reward_dist_file = open('./logs/reward_dist.txt', 'a') # log_reward_col_file = open('./logs/reward_col.txt', 'a') sampler = BatchSampler(args.env_name, batch_size=args.fast_batch_size, num_workers=args.num_workers) # print(sampler.envs.observation_space.shape) # print(sampler.envs.action_space.shape) # eewfe if continuous_actions: policy = NormalMLPPolicy( int(np.prod(sampler.envs.observation_space.shape)), int(np.prod(sampler.envs.action_space.shape)), hidden_sizes=(args.hidden_size, ) * args.num_layers) else: policy = CategoricalMLPPolicy( int(np.prod(sampler.envs.observation_space.shape)), sampler.envs.action_space.n, hidden_sizes=(args.hidden_size, ) * args.num_layers) # baseline = LinearFeatureBaseline( # int(np.prod(sampler.envs.observation_space.shape))) baseline = LinearFeatureBaseline(int(np.prod((2, )))) resume_training = True if resume_training: saved_policy_path = os.path.join( './TrainingResults/result2//saves/{0}'.format( 'maml-2DNavigation-dir'), 'policy-180.pt') if os.path.isfile(saved_policy_path): print('Loading a saved policy') policy_info = torch.load(saved_policy_path) policy.load_state_dict(policy_info) else: sys.exit("The requested policy does not exist for loading") metalearner = MetaLearner(sampler, policy, baseline, gamma=args.gamma, fast_lr=args.fast_lr, tau=args.tau, device=args.device) start_time = time.time() for batch in range(args.num_batches): tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size) episodes = metalearner.sample(tasks, first_order=args.first_order) metalearner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters, cg_damping=args.cg_damping, ls_max_steps=args.ls_max_steps, ls_backtrack_ratio=args.ls_backtrack_ratio) # print("observations shape: ") # print(episodes[0][1].observations.shape) # ewerw # Tensorboard total_reward_be, dist_reward_be, col_reward_be = total_rewards( [ep.rewards for ep, _ in episodes]) total_reward_af, dist_reward_af, col_reward_af = total_rewards( [ep.rewards for _, ep in episodes]) log_reward_total_file = open('./logs/reward_total.txt', 'a') log_reward_dist_file = open('./logs/reward_dist.txt', 'a') log_reward_col_file = open('./logs/reward_col.txt', 'a') log_reward_total_file.write( str(batch) + ',' + str(total_reward_be) + ',' + str(total_reward_af) + '\n') log_reward_dist_file.write( str(batch) + ',' + str(dist_reward_be) + ',' + str(dist_reward_af) + '\n') log_reward_col_file.write( str(batch) + ',' + str(col_reward_be) + ',' + str(col_reward_af) + '\n') log_reward_total_file.close( ) # not sure if open and close immediantly will help save the appended logs in-place log_reward_dist_file.close() log_reward_col_file.close() writer.add_scalar('total_rewards/before_update', total_reward_be, batch) 
writer.add_scalar('total_rewards/after_update', total_reward_af, batch) writer.add_scalar('distance_reward/before_update', dist_reward_be, batch) writer.add_scalar('distance_reward/after_update', dist_reward_af, batch) writer.add_scalar('collison_rewards/before_update', col_reward_be, batch) writer.add_scalar('collison_rewards/after_update', col_reward_af, batch) if batch % args.save_every == 0: # maybe it can save time/space if the models are saved only periodically # Save policy network print('Saving model {}'.format(batch)) with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)), 'wb') as f: torch.save(policy.state_dict(), f) if batch % 30 == 0: with open( os.path.join( log_traj_folder, 'train_episodes_observ_' + str(batch) + '.pkl'), 'wb') as f: pickle.dump( [ep.observations.cpu().numpy() for ep, _ in episodes], f) with open( os.path.join( log_traj_folder, 'valid_episodes_observ_' + str(batch) + '.pkl'), 'wb') as f: pickle.dump( [ep.observations.cpu().numpy() for _, ep in episodes], f) # with open(os.path.join(log_traj_folder, 'train_episodes_ped_state_'+str(batch)+'.pkl'), 'wb') as f: # pickle.dump([ep.hid_observations.cpu().numpy() for ep, _ in episodes], f) # with open(os.path.join(log_traj_folder, 'valid_episodes_ped_state_'+str(batch)+'.pkl'), 'wb') as f: # pickle.dump([ep.hid_observations.cpu().numpy() for _, ep in episodes], f) # save tasks # a sample task list of 2: [{'goal': array([0.0209588 , 0.15981938])}, {'goal': array([0.45034602, 0.17282322])}] with open( os.path.join(log_traj_folder, 'tasks_' + str(batch) + '.pkl'), 'wb') as f: pickle.dump(tasks, f) else: # supposed to be overwritten for each batch with open( os.path.join(log_traj_folder, 'latest_train_episodes_observ.pkl'), 'wb') as f: pickle.dump( [ep.observations.cpu().numpy() for ep, _ in episodes], f) with open( os.path.join(log_traj_folder, 'latest_valid_episodes_observ.pkl'), 'wb') as f: pickle.dump( [ep.observations.cpu().numpy() for _, ep in episodes], f) # with open(os.path.join(log_traj_folder, 'latest_train_episodes_ped_state.pkl'), 'wb') as f: # pickle.dump([ep.hid_observations.cpu().numpy() for ep, _ in episodes], f) # with open(os.path.join(log_traj_folder, 'latest_valid_episodes_ped_state.pkl'), 'wb') as f: # pickle.dump([ep.hid_observations.cpu().numpy() for _, ep in episodes], f) with open(os.path.join(log_traj_folder, 'latest_tasks.pkl'), 'wb') as f: pickle.dump(tasks, f) print('finished epoch {}; time elapsed: {}'.format( batch, time_elapsed(time.time() - start_time)))
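# The navigation variant above appends 'batch,before,after' rows to
# ./logs/reward_total.txt (and similar files for the distance and collision
# terms). A small sketch for reading one of those logs back is below; the
# comma-separated layout is taken from the write calls above, the rest is
# illustrative.
import numpy as np


def read_reward_log(path='./logs/reward_total.txt'):
    # Columns: batch index, reward before adaptation, reward after adaptation.
    data = np.atleast_2d(np.loadtxt(path, delimiter=','))
    return data[:, 0].astype(int), data[:, 1], data[:, 2]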
def main(args): continuous_actions = (args.env_name in [ 'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1', 'HalfCheetahDir-v1', '2DNavigation-v0', 'Pusher' ]) writer = SummaryWriter('./logs/{0}'.format(args.output_folder)) save_folder = './saves/{0}'.format(args.output_folder) if not os.path.exists(save_folder): os.makedirs(save_folder) with open(os.path.join(save_folder, 'config.json'), 'w') as f: config = {k: v for (k, v) in vars(args).items() if k != 'device'} config.update(device=args.device.type) json.dump(config, f, indent=2) if not args.hierarchical: sampler = BatchSampler(args.env_name, batch_size=args.fast_batch_size, num_workers=args.num_workers) if continuous_actions: policy = NormalMLPPolicy( int(np.prod(sampler.envs.observation_space.shape)), int(np.prod(sampler.envs.action_space.shape)), hidden_sizes=(args.hidden_size, ) * args.num_layers) else: policy = CategoricalMLPPolicy( int(np.prod(sampler.envs.observation_space.shape)), sampler.envs.action_space.n, hidden_sizes=(args.hidden_size, ) * args.num_layers) baseline = LinearFeatureBaseline( int(np.prod(sampler.envs.observation_space.shape))) metalearner = MetaLearner(sampler, policy, baseline, gamma=args.gamma, fast_lr=args.fast_lr, tau=args.tau, device=args.device) for i, batch in enumerate(range(args.num_batches)): tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size) episodes = metalearner.sample(tasks, first_order=args.first_order) metalearner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters, cg_damping=args.cg_damping, ls_max_steps=args.ls_max_steps, ls_backtrack_ratio=args.ls_backtrack_ratio) print('Total Rewards', str(total_rewards([ep.rewards for _, ep in episodes]))) # Tensorboard writer.add_scalar( 'total_rewards/before_update', total_rewards([ep.rewards for ep, _ in episodes]), batch) writer.add_scalar( 'total_rewards/after_update', total_rewards([ep.rewards for _, ep in episodes]), batch) if (i + 1) % args.save_every == 0: # Save policy network with open( os.path.join(save_folder, 'policy-{0}.pt'.format(batch)), 'wb') as f: torch.save(policy, f) else: sampler = BatchSampler(args.env_name, batch_size=args.fast_batch_size, num_workers=args.num_workers) # Get the policies higher_policy, lower_trainer, baseline = hierarchical_meta_policy( args.env_name, args.skills_dim, sampler=sampler, net_size=args.hidden_size, output_size=1) # Define the hierarchical meta learner hr_meta_learner = HierarchicalMetaLearner(sampler, higher_policy, baseline, gamma=args.gamma, fast_lr=args.fast_lr, tau=args.tau, device=args.device) # Training procedure for i, batch in enumerate(range(args.num_batches)): # Train the lower level policy lower_trainer.train() # Now freeze the lower level policy lower_networks = lower_trainer.networks lower_policy = lower_networks[0] lower_policy.trainable = False # Sample the different tasks tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size) # Sample the different episodes for the different tasks episodes = hr_meta_learner.sample(tasks, lower_policy, first_order=args.first_order) hr_meta_learner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters, cg_damping=args.cg_damping, ls_max_steps=args.ls_max_steps, ls_backtrack_ratio=args.ls_backtrack_ratio) print('Total Rewards', str(total_rewards([ep.rewards for _, ep in episodes]))) lower_policy.trainable = True # Tensorboard writer.add_scalar( 'total_rewards/before_update', total_rewards([ep.rewards for ep, _ in episodes]), batch) writer.add_scalar( 'total_rewards/after_update', total_rewards([ep.rewards for _, ep in 
episodes]), batch) if (i + 1) % args.save_every == 0: # Save the policy networks with open( os.path.join(save_folder, 'h_policy-{0}.pt'.format(batch)), 'wb') as f: torch.save(higher_policy, f) with open( os.path.join(save_folder, 'l_policy-{0}.pt'.format(batch)), 'wb') as f: torch.save(lower_policy, f) with open(os.path.join(save_folder, 'baseline.pt'), 'wb') as f: torch.save(baseline, f)
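# Unlike most variants above, the hierarchical branch saves whole modules
# (torch.save(higher_policy, f)) rather than state dicts, so reloading is a
# plain torch.load. A short sketch, with the file names taken from the save
# calls above and everything else assumed. Note that unpickling whole modules
# requires the original class definitions to be importable.
import os

import torch


def load_hierarchical_checkpoint(save_folder, batch, device='cpu'):
    with open(os.path.join(save_folder, 'h_policy-{0}.pt'.format(batch)), 'rb') as f:
        higher_policy = torch.load(f, map_location=device)
    with open(os.path.join(save_folder, 'l_policy-{0}.pt'.format(batch)), 'rb') as f:
        lower_policy = torch.load(f, map_location=device)
    with open(os.path.join(save_folder, 'baseline.pt'), 'rb') as f:
        baseline = torch.load(f, map_location=device)
    return higher_policy, lower_policy, baseline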
def main(args): group_name = ''.join([ random.choice(string.ascii_letters + string.digits) for n in range(4) ]) wandb.init(group=group_name, job_type='optimizer', tensorboard=True) wandb.config.update(args) device = torch.device(args.device) continuous_actions = (args.env_name in [ 'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1', 'HalfCheetahDir-v1', '2DNavigation-v0' ]) writer = SummaryWriter('./logs/{0}'.format(args.output_folder)) save_folder = './saves/{0}'.format(args.output_folder) if not os.path.exists(save_folder): os.makedirs(save_folder) with open(os.path.join(save_folder, 'config.json'), 'w') as f: config = {k: v for (k, v) in vars(args).items() if k != 'device'} config.update(device=device.type) json.dump(config, f, indent=2) sampler = BatchSampler(group_name, args.env_name, batch_size=args.fast_batch_size, num_workers=args.num_workers) if continuous_actions: policy = NormalMLPPolicy( int(np.prod(sampler.envs.observation_space.shape)), int(np.prod(sampler.envs.action_space.shape)), hidden_sizes=(args.hidden_size, ) * args.num_layers) else: policy = CategoricalMLPPolicy( int(np.prod(sampler.envs.observation_space.shape)), sampler.envs.action_space.n, hidden_sizes=(args.hidden_size, ) * args.num_layers) baseline = LinearFeatureBaseline( int(np.prod(sampler.envs.observation_space.shape))) metalearner = MetaLearner(sampler, policy, baseline, gamma=args.gamma, fast_lr=args.fast_lr, tau=args.tau, device=device) for batch in range(args.num_batches): tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size) episodes = metalearner.sample(tasks, first_order=args.first_order) metalearner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters, cg_damping=args.cg_damping, ls_max_steps=args.ls_max_steps, ls_backtrack_ratio=args.ls_backtrack_ratio) # Tensorboard writer.add_scalar('total_rewards/before_update', total_rewards([ep.rewards for ep, _ in episodes]), batch) writer.add_scalar('total_rewards/after_update', total_rewards([ep.rewards for _, ep in episodes]), batch) # Save policy network with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)), 'wb') as f: torch.save(policy.state_dict(), f)
def main(args): logging.basicConfig(filename=args.debug_file, level=logging.WARNING, filemode='w') logging.getLogger('metalearner').setLevel(logging.INFO) continuous_actions = (args.env_name in [ 'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1', 'HalfCheetahDir-v1', '2DNavigation-v0', 'PendulumTheta-v0' ]) writer = SummaryWriter('./logs/{0}'.format(args.output_folder)) save_folder = './saves/{0}'.format(args.output_folder) if not os.path.exists(save_folder): os.makedirs(save_folder) with open(os.path.join(save_folder, 'config.json'), 'w') as f: config = {k: v for (k, v) in vars(args).items() if k != 'device'} config.update(device=args.device.type) json.dump(config, f, indent=2) sampler = BatchSampler(args.env_name, batch_size=args.fast_batch_size, num_workers=args.num_workers) #if args.baseline == 'critic shared': # policy = NormalMLPPolicyA2C(int(np.prod(sampler.envs.observation_space.shape)), # int(np.prod(sampler.envs.action_space.shape)), # hidden_sizes=(args.hidden_size,) * args.num_layers) if continuous_actions: policy = NormalMLPPolicy( int(np.prod(sampler.envs.observation_space.shape)), int(np.prod(sampler.envs.action_space.shape)), hidden_sizes=(args.hidden_size, ) * args.num_layers) else: policy = CategoricalMLPPolicy( int(np.prod(sampler.envs.observation_space.shape)), sampler.envs.action_space.n, hidden_sizes=(args.hidden_size, ) * args.num_layers) if args.baseline == 'linear': baseline = LinearFeatureBaseline( int(np.prod(sampler.envs.observation_space.shape))) elif args.baseline == 'critic separate': baseline = CriticFunction( int(np.prod(sampler.envs.observation_space.shape)), 1, hidden_sizes=(args.hidden_size, ) * args.num_layers) #elif args.baseline == 'critic shared': # RANJANI TO DO metalearner = MetaLearner(sampler, policy, baseline, gamma=args.gamma, fast_lr=args.fast_lr, tau=args.tau, device=args.device, baseline_type=args.baseline, cliprange=args.cliprange, noptepochs=args.noptepochs, usePPO=args.usePPO, nminibatches=args.nminibatches, ppo_lr=args.ppo_lr, useSGD=args.useSGD, ppo_momentum=args.ppo_momentum, grad_clip=args.grad_clip) for batch in range(args.num_batches): print("*********************** Batch: " + str(batch) + " ****************************") print("Creating tasks...") tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size) print("Creating episodes...") episodes, grad_norm = metalearner.sample(tasks, first_order=args.first_order) print("Taking a meta step...") metalearner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters, cg_damping=args.cg_damping, ls_max_steps=args.ls_max_steps, ls_backtrack_ratio=args.ls_backtrack_ratio) print("Writing results to tensorboard...") # Tensorboard writer.add_scalar('total_rewards/before_update', total_rewards([ep.rewards for ep, _ in episodes]), batch) writer.add_scalar('total_rewards/after_update', total_rewards([ep.rewards for _, ep in episodes]), batch) if grad_norm: writer.add_scalar('PPO mb grad norm', np.average(grad_norm)) print(np.average(grad_norm)) print("Saving policy network...") # Save policy network with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)), 'wb') as f: torch.save(policy.state_dict(), f) print("***************************************************")
def main(args):
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', 'Point2DWalls-corner-v0',
        'Ant-v0', 'HalfCheetah-v0'
    ])

    writer = SummaryWriter(log_dir=args.log_dir)
    logger.configure(dir=args.log_dir, format_strs=['stdout', 'log', 'csv'])
    logger.log(args)
    json.dump(vars(args),
              open(os.path.join(args.log_dir, 'params.json'), 'w'),
              indent=2)

    sampler = BatchSampler(args.env_name, batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    metalearner = MetaLearner(sampler, policy, baseline, gamma=args.gamma,
                              fast_lr=args.fast_lr, tau=args.tau, device=args.device)

    for batch in range(args.num_batches):
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        metalearner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping, ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)

        # Tensorboard
        writer.add_scalar('total_rewards/before_update',
                          total_rewards([ep.rewards for ep, _ in episodes]), batch)
        writer.add_scalar('total_rewards/after_update',
                          total_rewards([ep.rewards for _, ep in episodes]), batch)

        logger.logkv('return_avg_pre',
                     total_rewards([ep.rewards for ep, _ in episodes]))
        logger.logkv('return_avg_post',
                     total_rewards([ep.rewards for _, ep in episodes]))
        logger.dumpkvs()
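# The variant above logs 'return_avg_pre'/'return_avg_post' through a
# baselines-style logger with a csv output format. Such loggers typically write
# a progress.csv under log_dir, which the sketch below assumes (adjust the file
# name if the logger used in this fork differs).
import csv
import os


def read_progress(log_dir, keys=('return_avg_pre', 'return_avg_post')):
    # Return one tuple of float values per logged iteration.
    rows = []
    with open(os.path.join(log_dir, 'progress.csv'), newline='') as f:
        for row in csv.DictReader(f):
            rows.append(tuple(float(row[k]) for k in keys if row.get(k)))
    return rows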
def main(args):
    set_random_seed(args.random)
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', '2DNavigationBiased-v0'
    ])

    writer = SummaryWriter('./logs/{0}'.format(args.alg))
    save_folder = './saves/{0}'.format(args.alg)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    sampler = BatchSampler(args.env_name, batch_size=args.fast_batch_size,
                           num_workers=args.num_workers, seed=args.random)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    if args.alg == 'simul':
        # vanilla maml
        metalearner = MetaLearner(sampler, policy, baseline, gamma=args.gamma,
                                  fast_lr=args.fast_lr, tau=args.tau,
                                  device=args.device)

        for batch in range(args.meta_policy_num * args.num_batches):
            # first sample tasks under the distribution
            tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
            # get episodes in the form of (train episodes, test episodes after adaptation)
            episodes = metalearner.sample(tasks, first_order=args.first_order)
            metalearner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters,
                             cg_damping=args.cg_damping,
                             ls_max_steps=args.ls_max_steps,
                             ls_backtrack_ratio=args.ls_backtrack_ratio)

            # Tensorboard
            writer.add_scalar('maml/before_update',
                              total_rewards([ep.rewards for ep, _ in episodes]),
                              batch)
            writer.add_scalar('maml/after_update',
                              total_rewards([ep.rewards for _, ep in episodes]),
                              batch)

            # Save policy network
            with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                      'wb') as f:
                torch.save(policy.state_dict(), f)

    elif args.alg == 'greedy':
        # multi-policy maml
        metalearner = KPolicyMetaLearner(sampler, policy, baseline,
                                         args.meta_policy_num, gamma=args.gamma,
                                         fast_lr=args.fast_lr, tau=args.tau,
                                         device=args.device)

        # visualize the policies' behavior
        trajectories = []
        for policy_idx in range(args.meta_policy_num):
            print(policy_idx)
            metalearner.optimize_policy_index(policy_idx)
            for batch in range(args.num_batches):
                print('batch num %d' % batch)
                tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
                metalearner.evaluate_optimized_policies(tasks)
                episodes = metalearner.sample(tasks, first_order=args.first_order)
                # loss is computed inside, then update policies
                metalearner.step(episodes, max_kl=args.max_kl,
                                 cg_iters=args.cg_iters,
                                 cg_damping=args.cg_damping,
                                 ls_max_steps=args.ls_max_steps,
                                 ls_backtrack_ratio=args.ls_backtrack_ratio)

                # not sure what to write in tensorboard...
                for epIdx in range(len(episodes)):
                    writer.add_scalar(
                        'kmaml/pi_' + str(policy_idx) + '_task_' + str(epIdx),
                        total_rewards([episodes[epIdx][1].rewards]), batch)

            # use a random task (no update here anyway) to visualize meta-policies
            tasks = sampler.sample_tasks(num_tasks=1)
            trajectories.append(metalearner.sample_meta_policy(tasks[0]))

        plotTrajectories(trajectories)
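# plotTrajectories above is not defined in this file; a hypothetical stand-in
# for 2-D navigation tasks is sketched below, assuming each trajectory is an
# array-like of (x, y) observations (the real helper may expect a different
# structure).
import matplotlib.pyplot as plt
import numpy as np


def plot_trajectories(trajectories, out_file='meta_policies.png'):
    # Draw one line per meta-policy trajectory and save the figure to disk.
    for idx, traj in enumerate(trajectories):
        traj = np.asarray(traj)
        plt.plot(traj[:, 0], traj[:, 1], label='meta-policy {0}'.format(idx))
    plt.legend()
    plt.savefig(out_file)
    plt.close()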
    the_model = CategoricalMLPPolicy(
        int(np.prod(sampler.envs.observation_space.shape)),
        sampler.envs.action_space.n,
        hidden_sizes=(args.hidden_size, ) * args.num_layers)
    the_model.load_state_dict(
        torch.load(os.path.join(save_folder, 'policy-{0}.pt'.format(batch))))

    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    metalearner = MetaLearner(sampler, the_model, baseline, gamma=args.gamma,
                              fast_lr=args.fast_lr, tau=args.tau, device=args.device)

    test_batch_size = 2
    test_reward_before = []
    test_reward_after = []
    for test_batch in range(test_batch_size):
        # sample one task
        test_task = sampler.sample_tasks(num_tasks=1)
        print("test_task: ", test_task)
        sampler.reset_task(test_task[0])
        # sample some episodes for that task
def main(args): save_folder = f'saves/{args.output_folder + get_date_str()}' if not os.path.exists(save_folder): os.makedirs(save_folder) with open(os.path.join(save_folder, 'config.json'), 'w') as f: config = {k: v for (k, v) in vars(args).items() if k != 'device'} config.update(device=args.device.type) json.dump(config, f, indent=2) print('Initializing samplers...') sampler = BatchSampler(args.env_name, batch_size=args.fast_batch_size, num_workers=args.num_workers) test_sampler = BatchSampler(args.env_name, test_env=True, batch_size=args.fast_batch_size, num_workers=max(1, args.num_workers // 2)) policy = NormalMLPPolicy( int(np.prod(sampler.envs.observation_space.shape)), int(np.prod(sampler.envs.action_space.shape)), hidden_sizes=(args.hidden_size, ) * args.num_layers) baseline = LinearFeatureBaseline( int(np.prod(sampler.envs.observation_space.shape))) print('Initializing meta-learners...') metalearner = MetaLearner(sampler, policy, baseline, gamma=args.gamma, fast_lr=args.fast_lr, tau=args.tau, device=args.device) # noqa: E128 # NOTE: we need this metalearner only to sample test tasks test_metalearner = MetaLearner(test_sampler, policy, baseline, gamma=args.gamma, fast_lr=args.fast_lr, tau=args.tau, device=args.device) # noqa: E128 print('Starting the training') # Initialize logging wandb.init() wandb.config.update(args) task_name2id = {name: i for i, name in enumerate(sampler._env._task_names)} task_id2name = sampler._env._task_names task2prob = np.ones(sampler._env.num_tasks) / sampler._env.num_tasks uniform = np.ones_like(task2prob) / sampler._env.num_tasks # outer loop (meta-training) for i in range(args.num_batches): print(f'Batch {i}') # sample trajectories from random tasks print(f'\tSampling a batch of {args.meta_batch_size} training tasks') tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size, task2prob=0.99 * task2prob + 0.01 * uniform) # Note: Dirty hack to overcome metaworld dirty hack task_names = [sampler._env._task_names[t['task']] for t in tasks] # inner loop (adaptation) # returns list of tuples (train_episodes, valid_episodes) print(f'\tTraining') episodes = metalearner.sample(tasks, first_order=args.first_order) print(f'\tUpdating the meta-model') metalearner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters, cg_damping=args.cg_damping, ls_max_steps=args.ls_max_steps, ls_backtrack_ratio=args.ls_backtrack_ratio) # Logging # before: before parameters update # after: after parameters adaptation to the task r_before = total_rewards([ep.rewards for ep, _ in episodes]) r_after = total_rewards([ep.rewards for _, ep in episodes]) test_episode_infos = [ep._info_list for ep, _ in episodes] success_rate_before, task_success_rate_before = get_success_rate( test_episode_infos, task_names, per_task=True) test_episode_infos = [ep._info_list for _, ep in episodes] success_rate_after, task_success_rate_after = get_success_rate( test_episode_infos, task_names, per_task=True) wandb.log( { 'total_rewards/before_update': r_before, 'total_rewards/after_update': r_after, 'success_rate/before_update': success_rate_before, 'success_rate/after_update': success_rate_after, 'success_rate/improvement': success_rate_after - success_rate_before, 'success_rate/before_update_macro': np.mean(list(task_success_rate_before.values())), 'success_rate/after_update_macro': np.mean(list(task_success_rate_after.values())), }, step=i) wandb.log( { f'success_rate/after_update/{task}': rate for task, rate in task_success_rate_after.items() }, step=i) wandb.log( { 
f'success_rate/before_update/{task}': rate for task, rate in task_success_rate_before.items() }, step=i) wandb.log( { f'success_rate/imrovement/{task}': task_success_rate_after[task] - task_success_rate_before[task] for task in task_success_rate_before.keys() }, step=i) wandb.log( { f'n_acquired_tasks/before_update/at_{x}': sum( rate > x for rate in task_success_rate_before.values()) for x in [0.001, 0.01, 0.05, 0.1, 0.5] }, step=i) wandb.log( { f'n_acquired_tasks/after_update/at_{x}': sum( rate > x for rate in task_success_rate_after.values()) for x in [0.001, 0.01, 0.05, 0.1, 0.5] }, step=i) if args.active_learning: new_task2prob = np.zeros_like(task2prob) if args.prob_f == 'linear': norm = 1e-7 + sum(task_success_rate_after.values()) for task, rate in task_success_rate_after.items(): task_id = task_name2id[task] new_task2prob[task_id] = 1. - rate / norm elif args.prob_f == 'softmax': # softmax(1 - rate) # numerical stability trick # http://cs231n.github.io/linear-classify/#softmax max_f = 1 - min(task_success_rate_after.values()) for task, rate in task_success_rate_after.items(): task_id = task_name2id[task] f = 1 - rate new_task2prob[task_id] = np.exp( (f - max_f) / args.temperature) new_task2prob = new_task2prob / (1e-7 + sum(new_task2prob)) elif args.prob_f == 'softmax2': # 1 - softmax(rate) max_f = max(task_success_rate_after.values()) for task, rate in task_success_rate_after.items(): task_id = task_name2id[task] new_task2prob[task_id] = np.exp( (rate - max_f) / args.temperature) new_task2prob = 1. - new_task2prob / (1e-7 + sum(new_task2prob)) else: raise RuntimeError( 'prob-f should be either "softmax", "softmax2" or "linear"' ) alpha = args.success_rate_smoothing task2prob = alpha * task2prob + (1 - alpha) * new_task2prob task2prob /= sum(task2prob) assert all(task2prob > 0) # strictly! 
wandb.log( { f'task2prob/{task_id2name[task_id]}': prob for task_id, prob in enumerate(task2prob) }, step=i) # meta-test if i % args.eval_every == 0: print(f'Evaluating on meta-test') # save policy network _save_path = os.path.join(save_folder, 'policy-{0}.pt'.format(i)) with open(_save_path, 'wb') as f: torch.save(policy.state_dict(), f) wandb.save(_save_path) # Evaluate on meta-test tasks = test_sampler.sample_tasks(num_tasks=2 * args.meta_batch_size) # Note: Dirty hack to overcome metaworld dirty hack task_names = [ test_sampler._env._task_names[t['task']] for t in tasks ] episodes = test_metalearner.sample(tasks, first_order=args.first_order) r_before = total_rewards([ep.rewards for ep, _ in episodes]) r_after = total_rewards([ep.rewards for _, ep in episodes]) test_episode_infos = [ep._info_list for ep, _ in episodes] success_rate_before, task_success_rate_before = get_success_rate( test_episode_infos, task_names, per_task=True) test_episode_infos = [ep._info_list for _, ep in episodes] success_rate_after, task_success_rate_after = get_success_rate( test_episode_infos, task_names, per_task=True) wandb.log( { 'total_rewards_test/before_update': r_before, 'total_rewards_test/after_update': r_after, 'success_rate_test/before_update': success_rate_before, 'success_rate_test/after_update': success_rate_after, 'success_rate_test/improvement': success_rate_after - success_rate_before }, step=i) wandb.log( { f'success_rate_test/after_update/{task}': rate for task, rate in task_success_rate_after.items() }, # noqa: E501 step=i) wandb.log( { f'success_rate_test/before_update/{task}': rate for task, rate in task_success_rate_before.items() }, # noqa: E501 step=i) wandb.log( { f'success_rate_test/imrovement/{task}': task_success_rate_after[task] - task_success_rate_before[task] for task in task_success_rate_before.keys() }, step=i) print('Saving the final model') # save final policy _save_path = os.path.join(save_folder, 'policy-final.pt') with open(_save_path, 'wb') as f: torch.save(policy.state_dict(), f) wandb.save(_save_path)
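# The 'softmax' branch of the active-learning update above, pulled out into a
# standalone function for clarity. The interface is illustrative: rates maps a
# task name to its success rate, and the returned vector sums to one.
import numpy as np


def softmax_task_probs(rates, temperature=1.0):
    # softmax over f = 1 - success_rate, shifted by max(f) for numerical
    # stability (the same trick used in the training loop above).
    f = 1.0 - np.asarray(list(rates.values()), dtype=np.float64)
    exp_f = np.exp((f - f.max()) / temperature)
    return exp_f / (1e-7 + exp_f.sum())


# Example: tasks with low success rates get sampled more often, e.g.
# softmax_task_probs({'reach': 0.9, 'push': 0.1}) is approximately [0.31, 0.69].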
def main(args): random.seed(args.seed) np.random.seed(args.seed) torch.cuda.manual_seed_all(args.seed) torch.manual_seed(args.seed) continuous_actions = (args.env_name in [ 'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1', 'HalfCheetahDir-v1', '2DNavigation-v0' ]) writer = SummaryWriter('./logs/{0}'.format(args.output_folder)) save_folder = './saves/{0}'.format(args.output_folder) if not os.path.exists(save_folder): os.makedirs(save_folder) with open(os.path.join(save_folder, 'config.json'), 'w') as f: config = {k: v for (k, v) in vars(args).items() if k != 'device'} config.update(device=args.device.type) json.dump(config, f, indent=2) sampler = BatchSampler(args.env_name, batch_size=args.fast_batch_size, num_workers=args.num_workers) if continuous_actions: policy = NormalMLPPolicy( int(np.prod(sampler.envs.observation_space.shape)), int(np.prod(sampler.envs.action_space.shape)), hidden_sizes=(args.hidden_size, ) * args.num_layers) else: policy = CategoricalMLPPolicy( int(np.prod(sampler.envs.observation_space.shape)), sampler.envs.action_space.n, hidden_sizes=(args.hidden_size, ) * args.num_layers) baseline = LinearFeatureBaseline( int(np.prod(sampler.envs.observation_space.shape))) metalearner = MetaLearner( sampler, policy, baseline, gamma=args.gamma, fast_lr=args.fast_lr, tau=args.tau, q_inner=args.inner_q == 'true', q_residuce_gradient=args.inner_q_residue_gradient == 'true', q_soft=args.inner_q_soft == 'true', q_soft_temp=args.inner_q_soft_temp, device=args.device, ) for batch in range(args.num_batches): if args.device.type == 'cuda': torch.cuda.empty_cache() tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size) episodes, adaptation_info = metalearner.sample( tasks, first_order=args.first_order) metalearner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters, cg_damping=args.cg_damping, ls_max_steps=args.ls_max_steps, ls_backtrack_ratio=args.ls_backtrack_ratio) # Tensorboard pre_update_rewards = total_rewards([ep.rewards for ep, _ in episodes]) post_update_rewards = total_rewards([ep.rewards for _, ep in episodes]) writer.add_scalar('total_rewards/before_update', pre_update_rewards, batch) writer.add_scalar('total_rewards/after_update', post_update_rewards, batch) writer.add_scalar('total_rewards/rewards_improvement', post_update_rewards - pre_update_rewards, batch) writer.add_scalar('adaptation/pre_update_inner_loss', adaptation_info.mean_pre_update_loss, batch) writer.add_scalar('adaptation/post_update_inner_loss', adaptation_info.mean_post_update_loss, batch) writer.add_scalar('adaptation/inner_loss_improvement', adaptation_info.mean_loss_improvment, batch) writer.add_scalar('adaptation/weight_change', adaptation_info.mean_weight_change, batch) # Save policy network with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)), 'wb') as f: torch.save(policy.state_dict(), f)
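# The adaptation diagnostics logged above (e.g. mean_weight_change) can be
# derived from the pre- and post-adaptation parameter dictionaries. A minimal
# sketch of one such metric is below, assuming both arguments are name -> float
# tensor mappings; this is illustrative and not necessarily how this fork's
# MetaLearner computes it.
import torch


def mean_weight_change(old_params, new_params):
    # Average absolute parameter change across all tensors in the dictionaries.
    changes = [(new_params[name] - param).abs().mean()
               for name, param in old_params.items()]
    return torch.stack(changes).mean().item()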
def main(args):
    wandb.config.update(
        {k: v for k, v in vars(args).items() if k in ['env_name', 'tau']})

    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0'
    ])

    save_folder = './saves/{0}'.format(args.output_folder)
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    sampler = BatchSampler(args.env_name, args.seed,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))
    critic = Critic(int(np.prod(sampler.envs.observation_space.shape)), 1,
                    hidden_sizes=(args.hidden_size, ) * args.num_layers)

    metalearner = MetaLearner(sampler, policy, baseline, gamma=args.gamma,
                              fast_lr=args.fast_lr, tau=args.tau, device=args.device)

    for batch in range(args.num_batches):
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        metalearner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping, ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)

        # Logging
        wandb.log({'total_rewards/before_update':
                   total_rewards([ep.rewards for ep, _ in episodes])}, step=batch)
        wandb.log({'total_rewards/after_update':
                   total_rewards([ep.rewards for _, ep in episodes])}, step=batch)

        # Save policy network
        with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                  'wb') as f:
            torch.save(policy.state_dict(), f)
def run_meta_training(self, final_model_folder, policy=None): parser = argparse.ArgumentParser( description='Reinforcement learning with ' 'Model-Agnostic Meta-Learning (MAML)') # General parser.add_argument('--env-name', type=str, help='name of the environment', default='BiddingMDP-v0') parser.add_argument('--gamma', type=float, default=0.95, help='value of the discount factor gamma') parser.add_argument('--tau', type=float, default=1.0, help='value of the discount factor for GAE') parser.add_argument('--first-order', action='store_true', help='use the first-order approximation of MAML') # Policy network (relu activation function) #parser.add_argument('--hidden-size', type=int, default=50, parser.add_argument('--hidden-size', type=int, default=200, help='number of hidden units per layer') parser.add_argument('--num-layers', type=int, default=2, help='number of hidden layers') # Task-specific parser.add_argument('--fast-batch-size', type=int, default=20, help='batch size for each individual task') parser.add_argument( '--fast-lr', type=float, default=0.5, help='learning rate for the 1-step gradient update of MAML') # Optimization parser.add_argument( '--num-batches', type=int, default=32, # parser.add_argument('--num-batches', type=int, default=32, # parser.add_argument('--num-batches', type=int, default=50, help='number of batches') #parser.add_argument('--meta-batch-size', type=int, default=50, # parser.add_argument('--meta-batch-size', type=int, default=50, parser.add_argument('--meta-batch-size', type=int, default=2, help='number of tasks per batch') parser.add_argument('--max-kl', type=float, default=1e-2, help='maximum value for the KL constraint in TRPO') parser.add_argument('--cg-iters', type=int, default=10, help='number of iterations of conjugate gradient') parser.add_argument('--cg-damping', type=float, default=1e-5, help='damping in conjugate gradient') # parser.add_argument('--ls-max-steps', type=int, default=2, parser.add_argument( '--ls-max-steps', type=int, default=15, # parser.add_argument('--ls-max-steps', type=int, default=15, help='maximum number of iterations for line search') parser.add_argument( '--ls-backtrack-ratio', type=float, default=0.8, help='maximum number of iterations for line search') # Miscellaneous parser.add_argument('--output-folder', type=str, default='maml', help='name of the output folder') # parser.add_argument('--num-workers', type=int, default=mp.cpu_count() - 2, parser.add_argument('--num-workers', type=int, default=4, help='number of workers for trajectories sampling') parser.add_argument('--device', type=str, default='cuda', help='set the device (cpu or cuda)') args = parser.parse_args() self.fast_batch_size = args.fast_batch_size self.max_kl = args.max_kl self.cg_iters = args.cg_iters self.first_order = args.first_order self.cg_damping = args.cg_damping self.ls_max_steps = args.ls_max_steps self.ls_backtrack_ratio = args.ls_backtrack_ratio self.output_folder = args.output_folder self.num_batches = args.num_batches continuous_actions = (args.env_name in [ 'BiddingMDP-v0', 'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1', 'HalfCheetahDir-v1', '2DNavigation-v0' ]) # Create logs and saves folder if they don't exist if not os.path.exists('./logs'): os.makedirs('./logs') if not os.path.exists('./saves'): os.makedirs('./saves') # Device # args.device = torch.device(args.device # if torch.cuda.is_available() else 'cpu') args.device = torch.device("cpu") writer = SummaryWriter('./logs/{0}'.format(args.output_folder)) save_folder = 
'./saves/{0}'.format(args.output_folder) if not os.path.exists(save_folder): os.makedirs(save_folder) with open(os.path.join(save_folder, 'config.json'), 'w') as f: config = {k: v for (k, v) in vars(args).items() if k != 'device'} config.update(device=args.device.type) json.dump(config, f, indent=2) sampler = BatchSampler(args.env_name, batch_size=args.fast_batch_size, num_workers=args.num_workers) if policy is None and continuous_actions: print("CREATING POLICY WHEN IT SHOULD NOT") exit() policy = NormalMLPPolicy( int(np.prod(sampler.envs.observation_space.shape)), int(np.prod(sampler.envs.action_space.shape)), hidden_sizes=(args.hidden_size, ) * args.num_layers) self.policy = policy elif policy is None: print("CREATING POLICY WHEN IT SHOULD NOT") exit() policy = CategoricalMLPPolicy( int(np.prod(sampler.envs.observation_space.shape)), sampler.envs.action_space.n, hidden_sizes=(args.hidden_size, ) * args.num_layers) baseline = LinearFeatureBaseline( int(np.prod(sampler.envs.observation_space.shape))) self.policy = policy metalearner = MetaLearner(sampler, policy, baseline, gamma=args.gamma, fast_lr=args.fast_lr, tau=args.tau, device=args.device) for batch in range(args.num_batches): tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size) episodes = metalearner.sample(tasks, first_order=args.first_order) metalearner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters, cg_damping=args.cg_damping, ls_max_steps=args.ls_max_steps, ls_backtrack_ratio=args.ls_backtrack_ratio) # Tensorboard writer.add_scalar( 'total_rewards/before_update', total_rewards([ep.rewards for ep, _ in episodes]), batch) writer.add_scalar( 'total_rewards/after_update', total_rewards([ep.rewards for _, ep in episodes]), batch) torch.cuda.empty_cache() # Save policy network final_model_path = final_model_folder + "meta_rl_gamma_policy_{}.pt".format( batch) with open(final_model_path, 'wb') as f: torch.save(policy.state_dict(), f) self.metalearner = metalearner return final_model_path
def main(args): continuous_actions = (args.env_name in ['AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1', 'HalfCheetahDir-v1', '2DNavigation-v0', '2DPointEnvCorner-v0']) save_folder = './saves/{0}'.format(args.env_name+'/'+args.output_folder) if args.output_folder!='maml-trial' and args.output_folder!='trial': i=0 while os.path.exists(save_folder): args.output_folder=str(i+1) i+=1 save_folder = './saves/{0}'.format(args.env_name+'/'+args.output_folder) log_directory = './logs/{0}'.format(args.env_name+'/'+args.output_folder) os.makedirs(save_folder) writer = SummaryWriter('./logs/{0}'.format(args.env_name+'/'+args.output_folder)) with open(os.path.join(save_folder, 'config.json'), 'w') as f: config = {k: v for (k, v) in vars(args).items() if k != 'device'} config.update(device=args.device.type) json.dump(config, f, indent=2) sampler = BatchSampler(args.env_name, batch_size=args.fast_batch_size, num_workers=args.num_workers) if continuous_actions: policy = NormalMLPPolicy( int(np.prod(sampler.envs.observation_space.shape)), int(np.prod(sampler.envs.action_space.shape)), hidden_sizes=(args.hidden_size,) * args.num_layers) else: policy = CategoricalMLPPolicy( int(np.prod(sampler.envs.observation_space.shape)), sampler.envs.action_space.n, hidden_sizes=(args.hidden_size,) * args.num_layers) baseline = LinearFeatureBaseline( int(np.prod(sampler.envs.observation_space.shape))) if args.load_dir is not None: policy.load_state_dict(torch.load(args.load_dir)) metalearner = MetaLearner(sampler, policy, baseline, args, gamma=args.gamma, fast_lr=args.fast_lr, tau=args.tau, device=args.device) for batch in range(args.num_batches): tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size) episodes = metalearner.sample(tasks, first_order=args.first_order) metalearner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters, cg_damping=args.cg_damping, ls_max_steps=args.ls_max_steps, ls_backtrack_ratio=args.ls_backtrack_ratio) print('total_rewards/before_update', total_rewards([ep.rewards for ep, _ in episodes]), batch) print('total_rewards/after_update', total_rewards([ep.rewards for _, ep in episodes]), batch) # Plotting figure # plotting(episodes, batch, save_folder,args.num_plots) if args.load_dir is not None: sys.exit(0) # Tensorboard writer.add_scalar('total_rewards/before_update', total_rewards([ep.rewards for ep, _ in episodes]), batch) writer.add_scalar('total_rewards/after_update', total_rewards([ep.rewards for _, ep in episodes]), batch) # Save policy network with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)), 'wb') as f: torch.save(policy.state_dict(), f)
def main(args):
    env_name = 'RVONavigationAll-v0'  # ['2DNavigation-v0', 'RVONavigation-v0', 'RVONavigationAll-v0']
    test_folder = './{0}'.format('test_nav')
    fast_batch_size = 40  # number of trajectories
    saved_policy_file = os.path.join(
        './TrainingResults/result3/saves/{0}'.format('maml-2DNavigation-dir'),
        'policy-180.pt')

    sampler = BatchSampler(env_name, batch_size=fast_batch_size, num_workers=3)
    policy = NormalMLPPolicy(
        int(np.prod(sampler.envs.observation_space.shape)),
        int(np.prod(sampler.envs.action_space.shape)),
        hidden_sizes=(100, ) * 2)

    # Loading policy
    if os.path.isfile(saved_policy_file):
        policy_info = torch.load(saved_policy_file,
                                 map_location=lambda storage, loc: storage)
        policy.load_state_dict(policy_info)
        print('Loaded saved policy')
    else:
        sys.exit("The requested policy does not exist for loading")

    # Creating test folder
    if not os.path.exists(test_folder):
        os.makedirs(test_folder)

    # Generate tasks
    # goal = [[-0.8, 0.9]]
    # task = [{'goal': goal}][0]
    tasks = sampler.sample_tasks(num_tasks=1)
    task = tasks[0]

    # Start validation
    print("Starting to test...Total step = ", args.grad_steps)
    start_time = time.time()

    # baseline = LinearFeatureBaseline(int(np.prod(sampler.envs.observation_space.shape)))
    baseline = LinearFeatureBaseline(int(np.prod((2, ))))
    metalearner = MetaLearner(sampler, policy, baseline, gamma=0.9, fast_lr=0.01,
                              tau=0.99, device='cpu')

    # test_episodes = metalearner.sample(tasks)
    # for train, valid in test_episodes:
    #     total_reward, dist_reward, col_reward = total_rewards(train.rewards)
    #     print(total_reward)
    #     total_reward, dist_reward, col_reward = total_rewards(valid.rewards)
    #     print(total_reward)

    test_episodes = metalearner.test(task, n_grad=args.grad_steps)
    print('-------------------')
    for n_grad, ep in test_episodes:
        total_reward, dist_reward, col_reward = total_rewards(ep.rewards)
        print(total_reward)
        # with open(os.path.join(test_folder, 'test_episodes_grad' + str(n_grad) + '.pkl'), 'wb') as f:
        #     pickle.dump([ep.observations.cpu().numpy(), ep], f)

    # with open(os.path.join(test_folder, 'task.pkl'), 'wb') as f:
    #     pickle.dump(task, f)

    print('Finished test. Time elapsed = {}'.format(
        time_elapsed(time.time() - start_time)))
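# Both navigation scripts above report wall-clock time through a time_elapsed
# helper that is defined elsewhere. A minimal sketch of such a formatter is
# below; the actual helper may format the duration differently.
def time_elapsed(seconds):
    # Format a duration in seconds as H:MM:SS.
    minutes, secs = divmod(int(seconds), 60)
    hours, minutes = divmod(minutes, 60)
    return '{0:d}:{1:02d}:{2:02d}'.format(hours, minutes, secs)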
def main(args):
    # Setup for logging
    tb_writer = SummaryWriter('./logs/tb_{}'.format(args.log_name))  # Tensorboard logging
    log = set_log(args)

    # Setup before meta-train starts
    sampler = BatchSampler(env_name=args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers, args=args)

    # NOTE Observation space is a list with [predator0, predator1, ..., prey]
    # Thus using the index of 0
    policy = NormalMLPPolicy(
        input_size=int(np.prod(sampler.envs.observation_space[0].shape)),
        output_size=int(np.prod(sampler.envs.action_space[0].shape)),
        hidden_sizes=(args.hidden_size, ) * args.num_layers)
    baseline = LinearFeatureBaseline(
        input_size=int(np.prod(sampler.envs.observation_space[0].shape)))

    meta_learner = MetaLearner(sampler, policy, baseline, gamma=args.gamma,
                               fast_lr=args.fast_lr, tau=args.tau,
                               device=args.device, args=args, log=log,
                               tb_writer=tb_writer)
    # meta_learner.load(
    #     filename="theta_200", directory="./pytorch_models")

    meta_tester = MetaTester(sampler, policy, baseline, gamma=args.gamma,
                             fast_lr=args.fast_lr, tau=args.tau,
                             device=args.device, args=args, log=log,
                             tb_writer=tb_writer)

    prey = Prey(env=sampler._env, args=args, log=log, tb_writer=tb_writer,
                name="prey", i_agent=0)

    # Meta-train starts
    iteration = 0
    while True:
        # Sample train and validation episode
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size, test=False)
        episodes = meta_learner.sample(tasks, prey,
                                       first_order=args.first_order,
                                       iteration=iteration)

        # Train meta-policy
        meta_learner.step(episodes=episodes, args=args)

        # Test meta-policy
        if iteration % 10 == 0:
            test_tasks = sampler.sample_tasks(num_tasks=5, test=True)
            meta_tester.few_shot_adaptation(meta_policy=meta_learner.policy,
                                            tasks=test_tasks,
                                            first_order=args.first_order,
                                            iteration=iteration, prey=prey)

        if iteration % 100 == 0:
            meta_learner.save(iteration)

        iteration += 1
def train_pretrained_model(args):
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', 'MountainCarContinuousVT-v0'
    ])

    # writer = SummaryWriter('./logs/{0}'.format(args.output_folder + '_pretrained'))
    save_folder = './saves/{0}'.format(args.output_folder + '_pretrained')
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    # batch_size=2*args.fast_batch_size to match the amount of data used in meta-learning
    sampler = BatchSampler(args.env_name,
                           batch_size=2 * args.fast_batch_size,
                           num_workers=args.num_workers)
    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    # Load pretrained model to continue from a previous batch, if requested
    cont_from_batch = 0
    if args.start_from_batch != -1:
        pretrained_model = os.path.join(
            save_folder, 'policy-{0}.pt'.format(args.start_from_batch - 1))
        if os.path.exists(pretrained_model):
            policy.load_state_dict(torch.load(pretrained_model))
            cont_from_batch = args.start_from_batch

    metalearner = MetaLearner(sampler, policy, baseline, gamma=args.gamma,
                              fast_lr=args.fast_lr, tau=args.tau,
                              device=args.device)

    for batch in range(cont_from_batch, args.num_batches):
        print('Currently processing Batch: {}'.format(batch + 1))

        task_sampling_time = time.time()
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        task_sampling_time = time.time() - task_sampling_time

        episode_generating_time = time.time()
        episodes = metalearner.sample_for_pretraining(
            tasks, first_order=args.first_order)
        episode_generating_time = time.time() - episode_generating_time

        learning_step_time = time.time()
        params = metalearner.adapt(episodes, first_order=args.first_order)
        metalearner.policy.load_state_dict(params, strict=True)
        learning_step_time = time.time() - learning_step_time

        print('Task Sampling Time: {}'.format(task_sampling_time))
        print('Episode Generating Time: {}'.format(episode_generating_time))
        print('Learning Step Time: {}'.format(learning_step_time))

        # Tensorboard
        # writer.add_scalar('total_rewards/before_update',
        #                   total_rewards([ep.rewards for ep, _ in episodes]), batch)
        # writer.add_scalar('total_rewards/after_update',
        #                   total_rewards([ep.rewards for _, ep in episodes]), batch)
        # experiment.log_metric("Avg Disc Reward (Pretrained)",
        #                       total_rewards([episodes.rewards], args.gamma), batch + 1)

        # Save policy network
        with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                  'wb') as f:
            torch.save(metalearner.policy.state_dict(), f)

    return

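# `total_rewards` is imported from elsewhere in this codebase. Assuming each element of
# `episodes_rewards` is a (horizon, num_trajectories) float tensor of per-step rewards,
# a minimal sketch of a discounted-return variant consistent with the
# `total_rewards(..., gamma)` calls in these scripts could look like this; the exact
# aggregation used by the real helper may differ.
import torch

def total_rewards_sketch(episodes_rewards, gamma=0.99):
    """Average discounted return over all tasks and trajectories (assumed semantics)."""
    returns = []
    for rewards in episodes_rewards:
        horizon = rewards.shape[0]
        discounts = gamma ** torch.arange(horizon, dtype=rewards.dtype,
                                          device=rewards.device).view(-1, 1)
        # Sum discounted rewards over time, then average over trajectories
        returns.append(torch.sum(discounts * rewards, dim=0).mean())
    return torch.stack(returns).mean().item()
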
def train_meta_learning_model(args):
    # import matplotlib.pyplot as plt
    # import matplotlib.animation as animation
    # from matplotlib import style
    # style.use('fivethirtyeight')
    # fig = plt.figure()
    # ax1 = fig.add_subplot(1, 1, 1)
    # xs = []
    # ys = []

    # def animate(i):
    #     ax1.clear()
    #     ax1.plot(xs, ys)

    rewards_before_ml = []
    rewards_after_ml = []

    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', 'MountainCarContinuousVT-v0'
    ])

    # writer = SummaryWriter('./logs/{0}'.format(args.output_folder + '_metalearned'))
    save_folder = './saves/{0}'.format(args.output_folder + '_metalearned')
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    with open(os.path.join(save_folder, 'config.json'), 'w') as f:
        config = {k: v for (k, v) in vars(args).items() if k != 'device'}
        config.update(device=args.device.type)
        json.dump(config, f, indent=2)

    sampler = BatchSampler(args.env_name,
                           batch_size=args.fast_batch_size,
                           num_workers=args.num_workers)
    torch.manual_seed(args.random_seed)

    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    else:
        policy = CategoricalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            sampler.envs.action_space.n,
            hidden_sizes=(args.hidden_size, ) * args.num_layers)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    # Load previously meta-learned model to continue from a previous batch, if requested
    cont_from_batch = 0
    if args.start_from_batch != -1:
        metalearned_model = os.path.join(
            save_folder, 'policy-{0}.pt'.format(args.start_from_batch - 1))
        if os.path.exists(metalearned_model):
            policy.load_state_dict(torch.load(metalearned_model))
            cont_from_batch = args.start_from_batch

    metalearner = MetaLearner(sampler, policy, baseline, gamma=args.gamma,
                              fast_lr=args.fast_lr, tau=args.tau,
                              device=args.device)

    for batch in range(cont_from_batch, args.num_batches):
        print('Currently processing Batch: {}'.format(batch + 1))

        task_sampling_time = time.time()
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size,
                                     sampling_type=args.sampling_type,
                                     points_per_dim=args.points_per_dim)
        task_sampling_time = time.time() - task_sampling_time

        episode_generating_time = time.time()
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        episode_generating_time = time.time() - episode_generating_time

        learning_step_time = time.time()
        metalearner.step(episodes, max_kl=args.max_kl, cg_iters=args.cg_iters,
                         cg_damping=args.cg_damping,
                         ls_max_steps=args.ls_max_steps,
                         ls_backtrack_ratio=args.ls_backtrack_ratio)
        learning_step_time = time.time() - learning_step_time

        print('Task Sampling Time: {}'.format(task_sampling_time))
        print('Episode Generating Time: {}'.format(episode_generating_time))
        print('Learning Step Time: {}'.format(learning_step_time))

        reward_before_ml = total_rewards([ep.rewards for ep, _ in episodes],
                                         args.gamma)
        reward_after_ml = total_rewards([ep.rewards for _, ep in episodes],
                                        args.gamma)
        print('Before Update: {} After Update: {}'.format(
            reward_before_ml, reward_after_ml))

        # experiment.log_metric("Avg Reward Before Update (MetaLearned)", reward_before_ml)
        experiment.log_metric("Avg Reward", reward_after_ml, batch + 1)
        rewards_before_ml.append(reward_before_ml)
        rewards_after_ml.append(reward_after_ml)

        # xs.append(batch + 1)
        # ys.append(total_rewards([ep.rewards for _, ep in episodes], args.gamma))
        # ani = animation.FuncAnimation(fig, animate, interval=1000)
        # plt.savefig('navg_baseline_monitor')

        # Save policy network
        with open(os.path.join(save_folder, 'policy-{0}.pt'.format(batch)),
                  'wb') as f:
            torch.save(metalearner.policy.state_dict(), f)

    # tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
    # episodes = metalearner.sample(tasks, first_order=args.first_order)
    # print("Avg Reward After Update (MetaLearned)",
    #       total_rewards([ep.rewards for _, ep in episodes], args.gamma))

    testing_sampler = BatchSampler(args.env_name,
                                   batch_size=args.testing_fbs,
                                   num_workers=args.num_workers)
    testing_metalearner = MetaLearner(testing_sampler, metalearner.policy,
                                      baseline, gamma=args.gamma,
                                      fast_lr=args.fast_lr, tau=args.tau,
                                      device=args.device)
    test_tasks = testing_sampler.sample_tasks(num_tasks=args.testing_mbs,
                                              sampling_type='rand',
                                              points_per_dim=-1)
    test_episodes = testing_metalearner.sample(test_tasks,
                                               first_order=args.first_order,
                                               no_update=True)
    test_reward = total_rewards([ep.rewards for ep in test_episodes],
                                args.gamma)
    print('-------------------------------------------------')
    print('Test Time reward is: ' + str(test_reward))
    print('-------------------------------------------------')

    pickle_reward_data_file = os.path.join(save_folder, 'reward_data.pkl')
    with open(pickle_reward_data_file, 'wb') as f:
        pickle.dump(rewards_before_ml, f)
        pickle.dump(rewards_after_ml, f)

    pickle_final_reward_file = os.path.join(save_folder, 'final_reward.pkl')
    with open(pickle_final_reward_file, 'wb') as f:
        pickle.dump(test_reward, f)

    return

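# Because `reward_data.pkl` above is written with two consecutive pickle.dump calls,
# reading it back requires two pickle.load calls in the same order. A small loader
# sketch (file names taken from the code above; the function itself is illustrative):
import os
import pickle

def load_reward_data(save_folder):
    with open(os.path.join(save_folder, 'reward_data.pkl'), 'rb') as f:
        rewards_before_ml = pickle.load(f)  # first dump: pre-update rewards per batch
        rewards_after_ml = pickle.load(f)   # second dump: post-update rewards per batch
    with open(os.path.join(save_folder, 'final_reward.pkl'), 'rb') as f:
        test_reward = pickle.load(f)
    return rewards_before_ml, rewards_after_ml, test_reward
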
def main(args):
    continuous_actions = (args.env_name in [
        'AntVel-v1', 'AntDir-v1', 'AntPos-v0', 'HalfCheetahVel-v1',
        'HalfCheetahDir-v1', '2DNavigation-v0', 'Point2DWalls-corner-v0',
        'Ant-v0', 'HalfCheetah-v0'
    ])

    logger.configure(dir=args.log_dir, format_strs=['stdout', 'log', 'csv'])
    logger.log(args)
    json.dump(vars(args),
              open(os.path.join(args.log_dir, 'params.json'), 'w'),
              indent=2)

    sampler = BatchSamplerMultiworld(args)
    sampler_val = BatchSamplerMultiworld(args, val=True)

    if continuous_actions:
        policy = NormalMLPPolicy(
            int(np.prod(sampler.envs.observation_space.shape)),
            int(np.prod(sampler.envs.action_space.shape)),
            hidden_sizes=(args.hidden_size, ) * args.num_layers,
            bias_transformation_size=args.bias_transformation_size,
            init_gain=args.init_gain,
        )
    else:
        raise NotImplementedError

    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))

    metalearner = MetaLearner(sampler, policy, baseline, gamma=args.gamma,
                              fast_lr=args.fast_lr, tau=args.tau,
                              entropy_coef=args.entropy_coef,
                              device=args.device)

    start_time = time.time()
    processes = []

    for batch in range(args.num_batches):
        metalearner.reset()
        tasks = sampler.sample_tasks(num_tasks=args.meta_batch_size)
        episodes = metalearner.sample(tasks, first_order=args.first_order)
        if sampler.rewarder.fit_counter > 0:
            metalearner.step(episodes, max_kl=args.max_kl,
                             cg_iters=args.cg_iters,
                             cg_damping=args.cg_damping,
                             ls_max_steps=args.ls_max_steps,
                             ls_backtrack_ratio=args.ls_backtrack_ratio)
        if batch % args.rewarder_fit_period == 0:
            sampler.fit_rewarder(logger)

        if args.rewarder == 'unsupervised':
            sampler.log_unsupervised(logger)
        log_main(logger, episodes, batch, args, start_time, metalearner)

        if batch % args.save_period == 0 or batch == args.num_batches - 1:
            save_model_maml(args, policy, batch)

        if batch % args.val_period == 0 or batch == args.num_batches - 1:
            val(args, sampler_val, policy, baseline, batch)

        if batch % args.vis_period == 0 or batch == args.num_batches - 1:
            if args.plot:
                p = Popen('python maml_rl/utils/visualize.py --log-dir {}'.format(
                    args.log_dir), shell=True)
                processes.append(p)

        logger.dumpkvs()

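# `save_model_maml`, `val`, and `log_main` are helpers defined elsewhere in this codebase.
# Purely as an illustration of the periodic-checkpoint pattern used above, a helper
# consistent with the call signature save_model_maml(args, policy, batch) might be
# sketched as follows; the directory layout is an assumption, not the repo's actual one.
import os
import torch

def save_model_maml_sketch(args, policy, batch):
    save_dir = os.path.join(args.log_dir, 'checkpoints')  # assumed layout under log_dir
    os.makedirs(save_dir, exist_ok=True)
    torch.save(policy.state_dict(),
               os.path.join(save_dir, 'policy-{0}.pt'.format(batch)))
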
        num_workers=args.num_workers)
    model = NormalMLPPolicy(
        int(np.prod(sampler.envs.observation_space.shape)),
        int(np.prod(sampler.envs.action_space.shape)),
        hidden_sizes=(args.hidden_size, ) * args.num_layers)
    checkpoint = torch.load(
        '../final_models/meta/{0}/policy-{1}.pt'.format(args.to_pickle, index))
    model.load_state_dict(checkpoint)
    baseline = LinearFeatureBaseline(
        int(np.prod(sampler.envs.observation_space.shape)))
    metalearner = MetaLearner(sampler, model, baseline, gamma=args.gamma,
                              fast_lr=args.fast_lr, tau=args.tau,
                              device=args.device)

    task_success = []
    for _ in range(buckets):
        tasks = env.unwrapped.sample_tasks(num_test_tasks)
        success = 0
        # times = []
        metalearner = gradient_step(0, tasks, args)
        for task in tasks:
            s = env.reset_task(task)
            step = 0
            d = False
            while not d: