def experiment(variant):
    # env = NormalizedBoxEnv(HalfCheetahEnv())
    # env = NormalizedBoxEnv(InvertedPendulumEnv())
    env = NormalizedBoxEnv(get_meta_env(variant['env_specs']))
    training_env = NormalizedBoxEnv(get_meta_env(variant['env_specs']))
    # Or for a specific version:
    # import gym
    # env = NormalizedBoxEnv(gym.make('HalfCheetah-v1'))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    qf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )
    algorithm = SoftActorCritic(
        env=env,
        training_env=training_env,
        policy=policy,
        qf=qf,
        vf=vf,
        **variant['algo_params']
    )
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
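# ---------------------------------------------------------------------------
# A minimal sketch (not part of the original script) of how the SAC experiment
# above might be invoked. The top-level keys ('env_specs', 'net_size',
# 'algo_params') are the ones the function actually reads; the base_env_name
# and 'normalized' flag are copied from an env_specs dict used elsewhere in
# this repo. The contents of 'algo_params' depend on SoftActorCritic's
# constructor and are left as an assumption here.
#
# if __name__ == '__main__':
#     variant = dict(
#         env_specs=dict(
#             base_env_name='meta_simple_meta_reacher',
#             normalized=False,
#         ),
#         net_size=256,
#         algo_params=dict(),  # SoftActorCritic hyperparameters go here
#     )
#     experiment(variant)
# ---------------------------------------------------------------------------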
def experiment(variant):
    with open(EXPERT_LISTING_YAML_PATH, 'r') as f:
        listings = yaml.load(f.read())
    expert_dir = listings[variant['expert_name']]['exp_dir']
    specific_run = listings[variant['expert_name']]['seed_runs'][variant['expert_seed_run_idx']]
    file_to_load = path.join(expert_dir, specific_run, 'extra_data.pkl')
    extra_data = joblib.load(file_to_load)

    # this script is for the non-meta-learning GAIL
    train_context_buffer, train_test_buffer = extra_data['meta_train']['context'], extra_data['meta_train']['test']
    test_context_buffer, test_test_buffer = extra_data['meta_test']['context'], extra_data['meta_test']['test']

    # set up the envs
    env_specs = variant['env_specs']
    meta_train_env, meta_test_env = get_meta_env(env_specs)

    # set up the policy and training algorithm
    if isinstance(meta_train_env.observation_space, Dict):
        if variant['algo_params']['policy_uses_pixels']:
            raise NotImplementedError('Not implemented pixel version of things!')
        else:
            obs_dim = int(np.prod(meta_train_env.observation_space.spaces['obs'].shape))
    else:
        obs_dim = int(np.prod(meta_train_env.observation_space.shape))
    action_dim = int(np.prod(meta_train_env.action_space.shape))

    print('obs dim: %d' % obs_dim)
    print('act dim: %d' % action_dim)
    sleep(3)

    # make the policy and policy optimizer
    hidden_sizes = [variant['algo_params']['policy_net_size']] * variant['algo_params']['policy_num_layers']
    z_dim = variant['algo_params']['np_params']['z_dim']
    qf1 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + z_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + z_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + z_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim + z_dim,
        action_dim=action_dim,
    )

    # disc_model = GAILDiscModel(obs_dim + action_dim + z_dim, hid_dim=variant['algo_params']['disc_net_size'])
    disc_model = MlpGAILDisc(
        hidden_sizes=variant['disc_hidden_sizes'],
        output_size=1,
        input_size=obs_dim + action_dim + z_dim,
        hidden_activation=torch.nn.functional.tanh,
        layer_norm=variant['disc_uses_layer_norm']
        # output_activation=identity,
    )

    policy_optimizer = NewSoftActorCritic(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        **variant['algo_params']['policy_params']
    )

    # Make the neural process
    # in the initial version we are assuming all trajectories have the same length
    timestep_enc_params = variant['algo_params']['np_params']['traj_enc_params']['timestep_enc_params']
    traj_enc_params = variant['algo_params']['np_params']['traj_enc_params']['traj_enc_params']
    timestep_enc_params['input_size'] = obs_dim + action_dim

    traj_samples, _ = train_context_buffer.sample_trajs(1, num_tasks=1)
    # len_context_traj = traj_samples[0][0]['observations'].shape[0]
    len_context_traj = 5
    traj_enc_params['input_size'] = timestep_enc_params['output_size'] * len_context_traj

    traj_enc = TrivialTrajEncoder(
        timestep_enc_params,
        traj_enc_params
    )

    trunk_params = variant['algo_params']['np_params']['r2z_map_params']['trunk_params']
    trunk_params['input_size'] = traj_enc.output_size

    split_params = variant['algo_params']['np_params']['r2z_map_params']['split_heads_params']
    split_params['input_size'] = trunk_params['output_size']
    split_params['output_size'] = variant['algo_params']['np_params']['z_dim']

    r2z_map = TrivialR2ZMap(
        trunk_params,
        split_params
    )

    np_enc = TrivialNPEncoder(
        variant['algo_params']['np_params']['np_enc_params']['agg_type'],
        traj_enc,
        r2z_map
    )

    # class StupidDistFormat():
    #     def __init__(self, var):
    #         self.mean = var
    # class ZeroModule(nn.Module):
    #     def __init__(self, z_dim):
    #         super().__init__()
    #         self.z_dim = z_dim
    #         self.fc = nn.Linear(10, 10)
    #     def forward(self, context):
    #         c_len = len(context)
    #         return StupidDistFormat(Variable(torch.zeros(c_len, self.z_dim), requires_grad=False))
    # np_enc = ZeroModule(variant['algo_params']['np_params']['z_dim'])

    train_task_params_sampler, test_task_params_sampler = get_meta_env_params_iters(env_specs)

    algorithm = NeuralProcessAIRL(
        meta_test_env,  # env is the test env, training_env is the training env (following rlkit original setup)
        policy,
        disc_model,
        train_context_buffer,
        train_test_buffer,
        test_context_buffer,
        test_test_buffer,
        np_enc,
        policy_optimizer,
        training_env=meta_train_env,  # the env used for generating trajectories
        train_task_params_sampler=train_task_params_sampler,
        test_task_params_sampler=test_task_params_sampler,
        **variant['algo_params']
    )

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
def experiment(variant):
    with open(EXPERT_LISTING_YAML_PATH, 'r') as f:
        listings = yaml.load(f.read())
    expert_dir = listings[variant['expert_name']]['exp_dir']
    specific_run = listings[variant['expert_name']]['seed_runs'][variant['expert_seed_run_idx']]
    file_to_load = path.join(expert_dir, specific_run, 'extra_data.pkl')
    extra_data = joblib.load(file_to_load)

    # this script is for the non-meta-learning airl
    train_context_buffer, train_test_buffer = extra_data['meta_train']['context'], extra_data['meta_train']['test']
    test_context_buffer, test_test_buffer = extra_data['meta_test']['context'], extra_data['meta_test']['test']

    # load the expert
    expert_policy = joblib.load(variant['expert_policy'])['algorithm']
    expert_policy.replay_buffer = None

    # set up the envs
    env_specs = variant['env_specs']
    meta_train_env, meta_test_env = get_meta_env(env_specs)
    meta_train_env.seed(variant['seed'])
    meta_test_env.seed(variant['seed'])

    if variant['scale_env_with_given_demo_stats']:
        assert not env_specs['normalized']
        meta_train_env = ScaledMetaEnv(
            meta_train_env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )
        meta_test_env = ScaledMetaEnv(
            meta_test_env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )
    print(meta_train_env)
    print(meta_test_env)

    # set up the policy and training algorithm
    if isinstance(meta_train_env.observation_space, Dict):
        if variant['algo_params']['policy_uses_pixels']:
            raise NotImplementedError('Not implemented pixel version of things!')
        else:
            obs_dim = int(np.prod(meta_train_env.observation_space.spaces['obs'].shape))
    else:
        obs_dim = int(np.prod(meta_train_env.observation_space.shape))
    action_dim = int(np.prod(meta_train_env.action_space.shape))

    print('obs dim: %d' % obs_dim)
    print('act dim: %d' % action_dim)
    sleep(3)

    # make the disc model
    z_dim = variant['algo_params']['z_dim']

    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['num_hidden_layers']
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim + z_dim,
        action_dim=action_dim,
    )

    # Make the encoder
    encoder = TimestepBasedEncoder(
        2 * obs_dim + action_dim,  # (s, a, s')
        variant['algo_params']['r_dim'],
        variant['algo_params']['z_dim'],
        variant['algo_params']['enc_hid_dim'],
        variant['algo_params']['r2z_hid_dim'],
        variant['algo_params']['num_enc_layer_blocks'],
        hid_act='relu',
        use_bn=True,
        within_traj_agg=variant['algo_params']['within_traj_agg']
    )

    train_task_params_sampler, test_task_params_sampler = get_meta_env_params_iters(env_specs)

    algorithm = MetaDagger(
        meta_test_env,  # env is the test env, training_env is the training env (following rlkit original setup)
        policy,
        expert_policy,
        train_context_buffer,
        train_test_buffer,
        test_context_buffer,
        test_test_buffer,
        encoder,
        training_env=meta_train_env,  # the env used for generating trajectories
        train_task_params_sampler=train_task_params_sampler,
        test_task_params_sampler=test_task_params_sampler,
        **variant['algo_params']
    )

    # copy the expert context demonstrations into the algorithm's replay buffer,
    # task by task, so training starts from the demonstration data
    for task_id in train_context_buffer.task_replay_buffers:
        erb = train_context_buffer.task_replay_buffers[task_id]
        rb = algorithm.replay_buffer.task_replay_buffers[task_id]
        erb_size = erb._size
        print(erb_size)
        for k in erb._observations:
            rb._observations[k][:erb_size] = erb._observations[k][:erb_size]
            rb._next_obs[k][:erb_size] = erb._next_obs[k][:erb_size]
        rb._actions[:erb_size] = erb._actions[:erb_size]
        rb._rewards[:erb_size] = erb._rewards[:erb_size]
        rb._terminals[:erb_size] = erb._terminals[:erb_size]
        rb._absorbing[:erb_size] = erb._absorbing[:erb_size]
        rb._size = erb_size
        rb._top = erb_size

    # print('\n\n')
    # for task_id in algorithm.replay_buffer.task_replay_buffers:
    #     rb = algorithm.replay_buffer.task_replay_buffers[task_id]
    #     print(rb._size)
    #     print(rb._top)
    #     print(rb._max_replay_buffer_size)

    if ptu.gpu_enabled():
        expert_policy.cuda()
        algorithm.cuda()
    algorithm.train()

    return 1
def experiment(variant):
    with open(EXPERT_LISTING_YAML_PATH, 'r') as f:
        listings = yaml.load(f.read())
    expert_dir = listings[variant['expert_name']]['exp_dir']
    specific_run = listings[variant['expert_name']]['seed_runs'][variant['expert_seed_run_idx']]
    file_to_load = path.join(expert_dir, specific_run, 'extra_data.pkl')
    extra_data = joblib.load(file_to_load)

    # this script is for the non-meta-learning AIRL
    train_context_buffer, train_test_buffer = extra_data['meta_train']['context'], extra_data['meta_train']['test']
    test_context_buffer, test_test_buffer = extra_data['meta_test']['context'], extra_data['meta_test']['test']

    # set up the envs
    env_specs = variant['env_specs']
    meta_train_env, meta_test_env = get_meta_env(env_specs)
    meta_train_env.seed(variant['seed'])
    meta_test_env.seed(variant['seed'])

    # set up the policy and training algorithm
    if isinstance(meta_train_env.observation_space, Dict):
        if variant['algo_params']['policy_uses_pixels']:
            raise NotImplementedError('Not implemented pixel version of things!')
        else:
            obs_dim = int(np.prod(meta_train_env.observation_space.spaces['obs'].shape))
    else:
        obs_dim = int(np.prod(meta_train_env.observation_space.shape))
    action_dim = int(np.prod(meta_train_env.action_space.shape))

    print('obs dim: %d' % obs_dim)
    print('act dim: %d' % action_dim)
    sleep(3)

    # make the disc model
    if variant['algo_params']['state_only']:
        print('\n\nUSING STATE ONLY DISC\n\n')
    if variant['algo_params']['transfer_version']:
        disc_model = TransferVersionSingleColorFetchCustomDisc(
            clamp_magnitude=variant['disc_clamp_magnitude'],
            z_dim=variant['algo_params']['np_params']['z_dim'],
            gamma=0.99
        )
    else:
        disc_model = ThirdVersionSingleColorFetchCustomDisc(
            clamp_magnitude=variant['disc_clamp_magnitude'],
            state_only=variant['algo_params']['state_only'],
            wrap_absorbing=variant['algo_params']['wrap_absorbing'],
            z_dim=variant['algo_params']['np_params']['z_dim']
        )
    if variant['algo_params']['use_target_disc']:
        target_disc = disc_model.copy()
    else:
        target_disc = None
    print(disc_model)
    print(disc_model.clamp_magnitude)

    z_dim = variant['algo_params']['np_params']['z_dim']

    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['num_hidden_layers']
    qf1 = ObsPreprocessedQFunc(
        target_disc.obs_processor if target_disc is not None else disc_model.obs_processor,
        z_dim,
        hidden_sizes=hidden_sizes,
        input_size=6 + 4 + 4 + 1 * variant['algo_params']['wrap_absorbing'],
        output_size=1,
        wrap_absorbing=variant['algo_params']['wrap_absorbing']
    )
    qf2 = ObsPreprocessedQFunc(
        target_disc.obs_processor if target_disc is not None else disc_model.obs_processor,
        z_dim,
        hidden_sizes=hidden_sizes,
        input_size=6 + 4 + 4 + 1 * variant['algo_params']['wrap_absorbing'],
        output_size=1,
        wrap_absorbing=variant['algo_params']['wrap_absorbing']
    )
    vf = ObsPreprocessedVFunc(
        target_disc.obs_processor if target_disc is not None else disc_model.obs_processor,
        z_dim,
        hidden_sizes=hidden_sizes,
        input_size=6 + 4 + 1 * variant['algo_params']['wrap_absorbing'],
        output_size=1,
        wrap_absorbing=variant['algo_params']['wrap_absorbing']
    )
    policy = WithZObsPreprocessedReparamTanhMultivariateGaussianPolicy(
        target_disc.obs_processor if target_disc is not None else disc_model.obs_processor,
        z_dim,
        hidden_sizes=hidden_sizes,
        obs_dim=6 + 4,
        action_dim=4
    )

    policy_optimizer = NewSoftActorCritic(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        wrap_absorbing=variant['algo_params']['wrap_absorbing'],
        **variant['policy_params']
    )

    # Make the neural process
    traj_enc = TrivialTrajEncoder(state_only=variant['algo_params']['state_only'])
    context_enc = TrivialContextEncoder(
        variant['algo_params']['np_params']['agg_type'],
        traj_enc,
        state_only=variant['algo_params']['state_only']
    )
    r2z_map = TrivialR2ZMap(z_dim)

    np_enc = TrivialNPEncoder(
        context_enc,
        r2z_map,
        state_only=variant['algo_params']['state_only']
    )

    train_task_params_sampler, test_task_params_sampler = get_meta_env_params_iters(env_specs)

    if variant['meta_fairl']:
        print('\n\nUSING META-FAIRL\n\n')
        algorithm_class = MetaFAIRL
    else:
        print('\n\nUSING META-AIRL\n\n')
        algorithm_class = NeuralProcessAIRL

    algorithm = algorithm_class(
        meta_test_env,  # env is the test env, training_env is the training env (following rlkit original setup)
        policy,
        disc_model,
        train_context_buffer,
        train_test_buffer,
        test_context_buffer,
        test_test_buffer,
        np_enc,
        policy_optimizer,
        training_env=meta_train_env,  # the env used for generating trajectories
        train_task_params_sampler=train_task_params_sampler,
        test_task_params_sampler=test_task_params_sampler,
        target_disc=target_disc,
        **variant['algo_params']
    )

    if ptu.gpu_enabled():
        algorithm.cuda()
    # print(target_disc)
    # print(next(algorithm.discriminator.obs_processor.parameters()).is_cuda)
    # print(next(algorithm.main_policy.preprocess_model.parameters()).is_cuda)
    # print(algorithm.main_policy.preprocess_model is algorithm.main_policy.copy().preprocess_model)
    # print(algorithm.main_policy.preprocess_model is algorithm.main_policy.preprocess_model.copy())
    # 1/0
    algorithm.train()

    return 1
def experiment(variant):
    with open(EXPERT_LISTING_YAML_PATH, 'r') as f:
        listings = yaml.load(f.read())
    expert_dir = listings[variant['expert_name']]['exp_dir']
    specific_run = listings[variant['expert_name']]['seed_runs'][variant['expert_seed_run_idx']]
    file_to_load = path.join(expert_dir, specific_run, 'extra_data.pkl')
    extra_data = joblib.load(file_to_load)

    # this script is for the non-meta-learning AIRL
    train_context_buffer, train_test_buffer = extra_data['meta_train']['context'], extra_data['meta_train']['test']
    test_context_buffer, test_test_buffer = extra_data['meta_test']['context'], extra_data['meta_test']['test']

    # set up the envs
    env_specs = variant['env_specs']
    meta_train_env, meta_test_env = get_meta_env(env_specs)
    meta_train_env.seed(variant['seed'])
    meta_test_env.seed(variant['seed'])

    # set up the policy and training algorithm
    if isinstance(meta_train_env.observation_space, Dict):
        if variant['algo_params']['policy_uses_pixels']:
            raise NotImplementedError('Not implemented pixel version of things!')
        else:
            obs_dim = int(np.prod(meta_train_env.observation_space.spaces['obs'].shape))
    else:
        obs_dim = int(np.prod(meta_train_env.observation_space.shape))
    action_dim = int(np.prod(meta_train_env.action_space.shape))

    print('obs dim: %d' % obs_dim)
    print('act dim: %d' % action_dim)
    sleep(3)

    # make the disc model
    if variant['algo_params']['state_only']:
        print('\n\nUSING STATE ONLY DISC\n\n')
    assert 'transfer_version' not in variant['algo_params']

    if variant['algo_params']['only_Dc'] or variant['algo_params']['disc_ignores_z']:
        disc_model = OnlyDcTFuncForFetch(
            T_clamp_magnitude=variant['T_clamp_magnitude'],
            gating_clamp_magnitude=variant['gating_clamp_magnitude'],
            state_only=variant['algo_params']['state_only'],
            wrap_absorbing=variant['algo_params']['wrap_absorbing'],
            D_c_repr_dim=variant['algo_params']['D_c_repr_dim'],
        )
    else:
        disc_model = TFuncForFetch(
            T_clamp_magnitude=variant['T_clamp_magnitude'],
            gating_clamp_magnitude=variant['gating_clamp_magnitude'],
            state_only=variant['algo_params']['state_only'],
            wrap_absorbing=variant['algo_params']['wrap_absorbing'],
            D_c_repr_dim=variant['algo_params']['D_c_repr_dim'],
            z_dim=variant['algo_params']['np_params']['z_dim']
        )
    if variant['algo_params']['use_target_disc']:
        target_disc = disc_model.copy()
    else:
        target_disc = None
    print(disc_model)
    print(disc_model.T_clamp_magnitude)
    print(disc_model.gating_clamp_magnitude)

    z_dim = variant['algo_params']['np_params']['z_dim']

    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['num_hidden_layers']

    # make the policy and its obs gating model
    if not variant['algo_params']['only_Dc']:
        latent_dim = z_dim
    else:
        latent_dim = variant['algo_params']['D_c_repr_dim']

    if variant['algo_params']['use_disc_obs_processor'] and variant['algo_params']['only_Dc']:
        assert variant['algo_params']['only_Dc']
        print('\n\nUSING DISC OBS PROCESSOR\n\n')
        policy_obs_gating = disc_model.D_c_repr_obs_processor
        policy = WithZObsPreprocessedReparamTanhMultivariateGaussianPolicy(
            policy_obs_gating,
            latent_dim,
            train_preprocess_model=False,
            hidden_sizes=hidden_sizes,
            obs_dim=6 + 4,
            action_dim=4
        )
    else:
        # print('\n\n$$$$$$$$\nNO BN IN POL GATING\n$$$$$$$$$\n\n')
        policy_obs_gating = ObsGating(variant['gating_clamp_magnitude'], z_dim=latent_dim)
        policy = WithZObsPreprocessedReparamTanhMultivariateGaussianPolicy(
            policy_obs_gating,
            latent_dim,
            train_preprocess_model=True,
            hidden_sizes=hidden_sizes,
            obs_dim=6 + 4,
            action_dim=4
        )
    print(policy)

    qf1 = ObsPreprocessedQFunc(
        policy.preprocess_model,
        latent_dim,
        hidden_sizes=hidden_sizes,
        input_size=6 + 4 + 4 + 1 * variant['algo_params']['wrap_absorbing'],
        output_size=1,
        wrap_absorbing=variant['algo_params']['wrap_absorbing']
    )
    qf2 = ObsPreprocessedQFunc(
        policy.preprocess_model,
        latent_dim,
        hidden_sizes=hidden_sizes,
        input_size=6 + 4 + 4 + 1 * variant['algo_params']['wrap_absorbing'],
        output_size=1,
        wrap_absorbing=variant['algo_params']['wrap_absorbing']
    )
    vf = ObsPreprocessedVFunc(
        policy.preprocess_model,
        latent_dim,
        hidden_sizes=hidden_sizes,
        input_size=6 + 4 + 1 * variant['algo_params']['wrap_absorbing'],
        output_size=1,
        wrap_absorbing=variant['algo_params']['wrap_absorbing']
    )

    policy_optimizer = NewSoftActorCritic(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        wrap_absorbing=variant['algo_params']['wrap_absorbing'],
        **variant['policy_params']
    )

    # make the context encoder for the discriminator
    traj_enc = TrivialTrajEncoder(state_only=variant['algo_params']['state_only'])
    disc_r_getter = TrivialContextEncoder(
        variant['algo_params']['np_params']['agg_type'],
        traj_enc,
        state_only=variant['algo_params']['state_only']
    )
    disc_encoder = TrivialDiscDcEncoder(disc_r_getter, variant['algo_params']['D_c_repr_dim'])

    # make the amortized q distribution
    if variant['algo_params']['q_uses_disc_r_getter']:
        r2z_map = TrivialR2ZMap(z_dim)
        q_model = TrivialNPEncoder(disc_r_getter, r2z_map, train_context_encoder=False)
    else:
        traj_enc = TrivialTrajEncoder(state_only=variant['algo_params']['state_only'])
        r2z_map = TrivialR2ZMap(z_dim)
        q_context_encoder = TrivialContextEncoder(
            variant['algo_params']['np_params']['agg_type'],
            traj_enc,
            state_only=variant['algo_params']['state_only']
        )
        q_model = TrivialNPEncoder(q_context_encoder, r2z_map, train_context_encoder=True)

    train_task_params_sampler, test_task_params_sampler = get_meta_env_params_iters(env_specs)

    algorithm = NeuralProcessMetaIRL(
        meta_test_env,  # env is the test env, training_env is the training env (following rlkit original setup)
        policy,
        disc_model,
        disc_encoder,
        train_context_buffer,
        train_test_buffer,
        test_context_buffer,
        test_test_buffer,
        q_model,
        policy_optimizer,
        training_env=meta_train_env,  # the env used for generating trajectories
        train_task_params_sampler=train_task_params_sampler,
        test_task_params_sampler=test_task_params_sampler,
        target_disc=target_disc,
        **variant['algo_params']
    )

    if ptu.gpu_enabled():
        algorithm.cuda()
    # print(target_disc)
    # print(next(algorithm.discriminator.obs_processor.parameters()).is_cuda)
    # print(next(algorithm.main_policy.preprocess_model.parameters()).is_cuda)
    # print(algorithm.main_policy.preprocess_model is algorithm.main_policy.copy().preprocess_model)
    # print(algorithm.main_policy.preprocess_model is algorithm.main_policy.preprocess_model.copy())
    # 1/0
    algorithm.train()

    return 1
def experiment(variant):
    with open(EXPERT_LISTING_YAML_PATH, 'r') as f:
        listings = yaml.load(f.read())
    expert_dir = listings[variant['expert_name']]['exp_dir']
    specific_run = listings[variant['expert_name']]['seed_runs'][variant['expert_seed_run_idx']]
    file_to_load = path.join(expert_dir, specific_run, 'extra_data.pkl')
    extra_data = joblib.load(file_to_load)

    # this script is for the non-meta-learning airl
    train_context_buffer, train_test_buffer = extra_data['meta_train']['context'], extra_data['meta_train']['test']
    test_context_buffer, test_test_buffer = extra_data['meta_test']['context'], extra_data['meta_test']['test']

    # set up the envs
    env_specs = variant['env_specs']
    meta_train_env, meta_test_env = get_meta_env(env_specs)
    meta_train_env.seed(variant['seed'])
    meta_test_env.seed(variant['seed'])

    if variant['scale_env_with_given_demo_stats']:
        assert not env_specs['normalized']
        meta_train_env = ScaledMetaEnv(
            meta_train_env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )
        meta_test_env = ScaledMetaEnv(
            meta_test_env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )
    print(meta_train_env)
    print(meta_test_env)

    # set up the policy and training algorithm
    if isinstance(meta_train_env.observation_space, Dict):
        if variant['algo_params']['policy_uses_pixels']:
            raise NotImplementedError('Not implemented pixel version of things!')
        else:
            obs_dim = int(np.prod(meta_train_env.observation_space.spaces['obs'].shape))
    else:
        obs_dim = int(np.prod(meta_train_env.observation_space.shape))
    action_dim = int(np.prod(meta_train_env.action_space.shape))

    print('obs dim: %d' % obs_dim)
    print('act dim: %d' % action_dim)
    sleep(3)

    # make the disc model
    z_dim = variant['algo_params']['z_dim']

    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['num_hidden_layers']
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim + z_dim,
        action_dim=action_dim,
    )

    # Make the encoder
    encoder = TimestepBasedEncoder(
        2 * obs_dim + action_dim,  # (s, a, s')
        variant['algo_params']['r_dim'],
        variant['algo_params']['z_dim'],
        variant['algo_params']['enc_hid_dim'],
        variant['algo_params']['r2z_hid_dim'],
        variant['algo_params']['num_enc_layer_blocks'],
        hid_act='relu',
        use_bn=True,
        within_traj_agg=variant['algo_params']['within_traj_agg']
    )
    # ---------------
    # encoder = WeightShareTimestepBasedEncoder(
    #     obs_dim,
    #     action_dim,
    #     64,
    #     variant['algo_params']['r_dim'],
    #     variant['algo_params']['z_dim'],
    #     variant['algo_params']['enc_hid_dim'],
    #     variant['algo_params']['r2z_hid_dim'],
    #     variant['algo_params']['num_enc_layer_blocks'],
    #     hid_act='relu',
    #     use_bn=True,
    #     within_traj_agg=variant['algo_params']['within_traj_agg']
    # )
    # ---------------
    # traj_enc = ConvTrajEncoder(
    #     variant['algo_params']['np_params']['traj_enc_params']['num_conv_layers'],
    #     # obs_dim + action_dim,
    #     obs_dim + action_dim + obs_dim,
    #     variant['algo_params']['np_params']['traj_enc_params']['channels'],
    #     variant['algo_params']['np_params']['traj_enc_params']['kernel'],
    #     variant['algo_params']['np_params']['traj_enc_params']['stride'],
    # )
    # Dc2R_map = Dc2RMap(
    #     variant['algo_params']['np_params']['Dc2r_params']['agg_type'],
    #     traj_enc,
    #     state_only=False
    # )
    # r2z_map = R2ZMap(
    #     variant['algo_params']['np_params']['r2z_params']['num_layers'],
    #     variant['algo_params']['np_params']['traj_enc_params']['channels'],
    #     variant['algo_params']['np_params']['r2z_params']['hid_dim'],
    #     variant['algo_params']['z_dim']
    # )
    # encoder = NPEncoder(
    #     Dc2R_map,
    #     r2z_map,
    # )

    train_task_params_sampler, test_task_params_sampler = get_meta_env_params_iters(env_specs)

    algorithm = NeuralProcessBC(
        meta_test_env,  # env is the test env, training_env is the training env (following rlkit original setup)
        policy,
        train_context_buffer,
        train_test_buffer,
        test_context_buffer,
        test_test_buffer,
        encoder,
        training_env=meta_train_env,  # the env used for generating trajectories
        train_task_params_sampler=train_task_params_sampler,
        test_task_params_sampler=test_task_params_sampler,
        **variant['algo_params']
    )

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
def experiment(specs):
    # this is just bad nomenclature: specific_exp_dir is the dir where you will find
    # the specific experiment run (with a particular seed etc.) of the expert policy
    # to use for generating trajectories
    if not specs['use_scripted_policy']:
        policy_is_scripted = False
        expert = joblib.load(path.join(specs['expert_dir'], 'extra_data.pkl'))['algorithm']

        # max_path_length = expert.max_path_length
        max_path_length = specs['max_path_length']
        if max_path_length != expert.max_path_length:
            print('\n\nUsing max_path_length {}! Expert\'s was {}!'.format(
                max_path_length, expert.max_path_length))

        attrs = [
            'max_path_length', 'policy_uses_pixels', 'policy_uses_task_params',
            'no_terminal'
        ]
        expert_policy_specs = {att: getattr(expert, att) for att in attrs}
        expert_policy_specs['wrap_absorbing'] = specs['wrap_absorbing']
        no_terminal = specs['no_terminal']
    else:
        policy_is_scripted = True
        max_path_length = specs['max_path_length']
        wrap_absorbing = specs['wrap_absorbing']
        expert_policy_specs = {
            'policy_uses_pixels': specs['policy_uses_pixels'],
            'policy_uses_task_params': specs['policy_uses_task_params'],
            'concat_task_params_to_policy_obs': specs['concat_task_params_to_policy_obs']
        }
        no_terminal = specs['no_terminal']
        expert = get_scripted_policy(specs['scripted_policy_name'])

    # set up the envs
    env_specs = specs['env_specs']
    meta_train_env, meta_test_env = get_meta_env(env_specs)

    # get the task param iterators for the meta envs
    meta_train_params_sampler, meta_test_params_sampler = get_meta_env_params_iters(env_specs)

    # make the replay buffers
    if specs['wrap_absorbing']:
        _max_buffer_size = (max_path_length + 2) * specs['num_rollouts_per_task']
    else:
        _max_buffer_size = max_path_length * specs['num_rollouts_per_task']
    _max_buffer_size = int(np.ceil(_max_buffer_size / float(specs['subsample_factor']))) + 10
    # + 10 is just in case somewhere someone uses ._size of replay buffers incorrectly

    buffer_constructor = lambda env_for_buffer: MetaEnvReplayBuffer(
        _max_buffer_size,
        env_for_buffer,
        policy_uses_pixels=specs['student_policy_uses_pixels'],
        # we don't want the student policy to be looking at true task parameters
        policy_uses_task_params=False,
        concat_task_params_to_policy_obs=False
    )

    train_context_buffer = buffer_constructor(meta_train_env)
    test_context_buffer = buffer_constructor(meta_test_env)

    render = specs['render']
    check_for_success = specs['check_for_success']

    # fill the train buffers
    fill_buffer(
        train_context_buffer,
        meta_train_env,
        expert,
        expert_policy_specs,
        meta_train_params_sampler,
        specs['num_rollouts_per_task'],
        max_path_length,
        no_terminal=no_terminal,
        wrap_absorbing=specs['wrap_absorbing'],
        policy_is_scripted=policy_is_scripted,
        render=render,
        check_for_success=check_for_success,
        subsample_factor=specs['subsample_factor'],
        deterministic=specs['get_deterministic_expert_demos']
    )
    train_test_buffer = deepcopy(train_context_buffer)

    # fill the test buffers
    fill_buffer(
        test_context_buffer,
        meta_train_env,
        expert,
        expert_policy_specs,
        meta_test_params_sampler,
        specs['num_rollouts_per_task'],
        max_path_length,
        no_terminal=no_terminal,
        wrap_absorbing=specs['wrap_absorbing'],
        policy_is_scripted=policy_is_scripted,
        render=render,
        check_for_success=check_for_success,
        subsample_factor=specs['subsample_factor'],
        deterministic=specs['get_deterministic_expert_demos']
    )
    test_test_buffer = deepcopy(test_context_buffer)

    # save the replay buffers
    d = {
        'meta_train': {
            'context': train_context_buffer,
            'test': train_test_buffer
        },
        'meta_test': {
            'context': test_context_buffer,
            'test': test_test_buffer
        }
    }
    logger.save_extra_data(d)

    return 1
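# ---------------------------------------------------------------------------
# Hedged usage note (not part of the original script): logger.save_extra_data
# is assumed to write the dict above to an 'extra_data.pkl' file in the run's
# log directory, which is the file the training scripts in this section load
# back, e.g.:
#
#     extra_data = joblib.load(path.join(expert_dir, specific_run, 'extra_data.pkl'))
#     train_context_buffer = extra_data['meta_train']['context']
#
# The exact filename and directory layout depend on the logger configuration.
# ---------------------------------------------------------------------------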
def experiment(variant):
    with open(EXPERT_LISTING_YAML_PATH, 'r') as f:
        listings = yaml.load(f.read())
    expert_dir = listings[variant['expert_name']]['exp_dir']
    specific_run = listings[variant['expert_name']]['seed_runs'][variant['expert_seed_run_idx']]
    file_to_load = path.join(expert_dir, specific_run, 'extra_data.pkl')
    extra_data = joblib.load(file_to_load)

    # this script is for the non-meta-learning GAIL
    train_context_buffer, train_test_buffer = extra_data['meta_train']['context'], extra_data['meta_train']['test']
    test_context_buffer, test_test_buffer = extra_data['meta_test']['context'], extra_data['meta_test']['test']

    # set up the envs
    env_specs = variant['env_specs']
    meta_train_env, meta_test_env = get_meta_env(env_specs)

    # student policy should not have access to any task information
    print(variant['algo_params'].keys())
    meta_train_env.policy_uses_pixels = variant['algo_params']['policy_uses_pixels']
    meta_train_env.policy_uses_task_params = False
    meta_train_env.concat_task_params_to_policy_obs = False
    meta_test_env.policy_uses_pixels = variant['algo_params']['policy_uses_pixels']
    meta_test_env.policy_uses_task_params = False
    meta_test_env.concat_task_params_to_policy_obs = False

    # set up the policy and training algorithm
    if isinstance(meta_train_env.observation_space, Dict):
        if variant['algo_params']['policy_uses_pixels']:
            raise NotImplementedError('Not implemented pixel version of things!')
        else:
            obs_dim = int(np.prod(meta_train_env.observation_space.spaces['obs'].shape))
    else:
        obs_dim = int(np.prod(meta_train_env.observation_space.shape))
    action_dim = int(np.prod(meta_train_env.action_space.shape))

    print('obs dim: %d' % obs_dim)
    print('act dim: %d' % action_dim)
    sleep(3)

    policy_net_size = variant['algo_params']['policy_net_size']
    policy_num_layers = variant['algo_params']['policy_num_layers']
    hidden_sizes = [policy_net_size] * policy_num_layers
    # policy = MlpPolicy(
    #     [policy_net_size, policy_net_size],
    #     action_dim,
    #     obs_dim + variant['algo_params']['np_params']['z_dim'],
    #     hidden_activation=torch.nn.functional.tanh,
    #     layer_norm=variant['algo_params']['use_layer_norm']
    # )
    policy = MlpPolicy(
        hidden_sizes,
        action_dim,
        obs_dim + variant['algo_params']['np_params']['z_dim'],
        # hidden_activation=torch.nn.functional.relu,
        hidden_activation=torch.nn.functional.tanh,
        output_activation=torch.nn.functional.tanh,
        layer_norm=variant['algo_params']['use_layer_norm']
        # batch_norm=True
    )

    # Make the neural process
    # in the initial version we are assuming all trajectories have the same length
    timestep_enc_params = variant['algo_params']['np_params']['traj_enc_params']['timestep_enc_params']
    traj_enc_params = variant['algo_params']['np_params']['traj_enc_params']['traj_enc_params']
    timestep_enc_params['input_size'] = obs_dim + action_dim

    traj_samples, _ = train_context_buffer.sample_trajs(1, num_tasks=1)
    len_context_traj = traj_samples[0][0]['observations'].shape[0]
    len_context_traj = 5
    traj_enc_params['input_size'] = timestep_enc_params['output_size'] * len_context_traj

    traj_enc = TrivialTrajEncoder(timestep_enc_params, traj_enc_params)

    trunk_params = variant['algo_params']['np_params']['r2z_map_params']['trunk_params']
    trunk_params['input_size'] = traj_enc.output_size

    split_params = variant['algo_params']['np_params']['r2z_map_params']['split_heads_params']
    split_params['input_size'] = trunk_params['output_size']
    split_params['output_size'] = variant['algo_params']['np_params']['z_dim']

    r2z_map = TrivialR2ZMap(trunk_params, split_params)

    np_enc = TrivialNPEncoder(
        variant['algo_params']['np_params']['np_enc_params']['agg_type'],
        traj_enc,
        r2z_map
    )

    train_task_params_sampler, test_task_params_sampler = get_meta_env_params_iters(env_specs)

    algorithm = NeuralProcessBC(
        meta_test_env,  # env is the test env, training_env is the training env (following rlkit original setup)
        policy,
        train_context_buffer,
        train_test_buffer,
        test_context_buffer,
        test_test_buffer,
        np_enc,
        train_task_params_sampler=train_task_params_sampler,
        test_task_params_sampler=test_task_params_sampler,
        training_env=meta_train_env,  # the env used for generating trajectories
        **variant['algo_params']
    )

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
def experiment(variant):
    with open(EXPERT_LISTING_YAML_PATH, 'r') as f:
        listings = yaml.load(f.read())
    expert_dir = listings[variant['expert_name']]['exp_dir']
    specific_run = listings[variant['expert_name']]['seed_runs'][variant['expert_seed_run_idx']]
    file_to_load = path.join(expert_dir, specific_run, 'extra_data.pkl')
    extra_data = joblib.load(file_to_load)

    # this script is for the non-meta-learning airl
    train_context_buffer, train_test_buffer = extra_data['meta_train']['context'], extra_data['meta_train']['test']
    test_context_buffer, test_test_buffer = extra_data['meta_test']['context'], extra_data['meta_test']['test']

    # set up the envs
    env_specs = variant['env_specs']
    meta_train_env, meta_test_env = get_meta_env(env_specs)

    # set up the policy and training algorithm
    if isinstance(meta_train_env.observation_space, Dict):
        if variant['algo_params']['policy_uses_pixels']:
            raise NotImplementedError('Not implemented pixel version of things!')
        else:
            obs_dim = int(np.prod(meta_train_env.observation_space.spaces['obs'].shape))
    else:
        obs_dim = int(np.prod(meta_train_env.observation_space.shape))
    action_dim = int(np.prod(meta_train_env.action_space.shape))

    print('obs dim: %d' % obs_dim)
    print('act dim: %d' % action_dim)
    sleep(3)

    # make the disc model
    z_dim = variant['algo_params']['np_params']['z_dim']

    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['num_hidden_layers']

    obs_processor = ObsGatingV1(
        clamp_magnitude=variant['gate_logit_clamp_magnitude'],
        z_dim=z_dim
    )
    policy = WithZObsPreprocessedReparamTanhMultivariateGaussianPolicy(
        obs_processor,
        z_dim,
        hidden_sizes=hidden_sizes,
        obs_dim=6 + 4,
        action_dim=4,
        train_preprocess_model=True
    )

    # Make the neural process
    traj_enc = TrivialTrajEncoder(state_only=variant['algo_params']['state_only'])
    context_enc = TrivialContextEncoder(
        variant['algo_params']['np_params']['agg_type'],
        traj_enc,
        state_only=variant['algo_params']['state_only']
    )
    r2z_map = TrivialR2ZMap(z_dim)

    np_enc = TrivialNPEncoder(
        context_enc,
        r2z_map,
        state_only=variant['algo_params']['state_only']
    )

    train_task_params_sampler, test_task_params_sampler = get_meta_env_params_iters(env_specs)

    algorithm = NeuralProcessBC(
        meta_test_env,  # env is the test env, training_env is the training env (following rlkit original setup)
        policy,
        train_context_buffer,
        train_test_buffer,
        test_context_buffer,
        test_test_buffer,
        np_enc,
        training_env=meta_train_env,  # the env used for generating trajectories
        train_task_params_sampler=train_task_params_sampler,
        test_task_params_sampler=test_task_params_sampler,
        **variant['algo_params']
    )

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
def experiment(variant):
    env_specs = variant['env_specs']
    if variant['algo_params']['meta']:
        env, training_env = get_meta_env(env_specs)
    else:
        if env_specs['train_test_env']:
            env, training_env = get_env(env_specs)
        else:
            env, _ = get_env(env_specs)
            training_env, _ = get_env(env_specs)

    if variant['algo_params']['meta']:
        train_task_params_sampler, test_task_params_sampler = get_meta_env_params_iters(env_specs)

    print(env.observation_space)

    if isinstance(env.observation_space, Dict):
        if not variant['algo_params']['policy_uses_pixels']:
            obs_dim = int(np.prod(env.observation_space.spaces['obs'].shape))
            if variant['algo_params']['policy_uses_task_params']:
                obs_dim += int(np.prod(env.observation_space.spaces['obs_task_params'].shape))
        else:
            raise NotImplementedError()
    else:
        obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']
    hidden_sizes = [net_size] * variant['num_hidden_layers']
    if variant['use_custom_ant_models']:
        assert isinstance(env.observation_space, Dict)
        print('CUSTOM ANT WITH LINEAR EMBEDDING OF THE TARGET POSITION')
        qf1 = AntRandGoalCustomQFunc(
            int(np.prod(env.observation_space.spaces['obs_task_params'].shape)),
            variant['goal_embed_dim'],
            hidden_sizes=hidden_sizes,
            input_size=int(np.prod(env.observation_space.spaces['obs'].shape)) + action_dim,
            output_size=1,
        )
        qf2 = AntRandGoalCustomQFunc(
            int(np.prod(env.observation_space.spaces['obs_task_params'].shape)),
            variant['goal_embed_dim'],
            hidden_sizes=hidden_sizes,
            input_size=int(np.prod(env.observation_space.spaces['obs'].shape)) + action_dim,
            output_size=1,
        )
        vf = AntRandGoalCustomVFunc(
            int(np.prod(env.observation_space.spaces['obs_task_params'].shape)),
            variant['goal_embed_dim'],
            hidden_sizes=hidden_sizes,
            input_size=int(np.prod(env.observation_space.spaces['obs'].shape)),
            output_size=1,
        )
        policy = AntRandGoalCustomReparamTanhMultivariateGaussianPolicy(
            int(np.prod(env.observation_space.spaces['obs_task_params'].shape)),
            variant['goal_embed_dim'],
            hidden_sizes=hidden_sizes,
            obs_dim=int(np.prod(env.observation_space.spaces['obs'].shape)),
            action_dim=action_dim,
        )
        # CUSTOM ANT WITH GATING ACTIVATIONS OF EACH LAYER
        # qf1 = AntCustomGatingQFuncV1()
        # qf2 = AntCustomGatingQFuncV1()
        # vf = AntCustomGatingVFuncV1()
        # policy = AntCustomGatingV1ReparamTanhMultivariateGaussianPolicy()
    else:
        print('Using simple model')
        qf1 = FlattenMlp(
            hidden_sizes=hidden_sizes,
            input_size=obs_dim + action_dim,
            output_size=1,
        )
        qf2 = FlattenMlp(
            hidden_sizes=hidden_sizes,
            input_size=obs_dim + action_dim,
            output_size=1,
        )
        vf = FlattenMlp(
            hidden_sizes=hidden_sizes,
            input_size=obs_dim,
            output_size=1,
        )
        policy = ReparamTanhMultivariateGaussianPolicy(
            hidden_sizes=hidden_sizes,
            obs_dim=obs_dim,
            action_dim=action_dim,
        )

    if variant['algo_params']['meta']:
        algorithm = MetaNewSoftActorCritic(
            env=env,
            training_env=training_env,
            policy=policy,
            qf1=qf1,
            qf2=qf2,
            vf=vf,
            train_task_params_sampler=train_task_params_sampler,
            test_task_params_sampler=test_task_params_sampler,
            true_env_obs_dim=int(np.prod(env.observation_space.spaces['obs'].shape)),
            **variant['algo_params']
        )
    else:
        algorithm = NewSoftActorCritic(
            env=env,
            training_env=training_env,
            policy=policy,
            qf1=qf1,
            qf2=qf2,
            vf=vf,
            **variant['algo_params']
        )

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
def experiment(variant):
    with open(EXPERT_LISTING_YAML_PATH, 'r') as f:
        listings = yaml.load(f.read())
    expert_dir = listings[variant['expert_name']]['exp_dir']
    specific_run = listings[variant['expert_name']]['seed_runs'][variant['expert_seed_run_idx']]
    file_to_load = path.join(expert_dir, specific_run, 'extra_data.pkl')
    extra_data = joblib.load(file_to_load)

    train_context_buffer, train_test_buffer = extra_data['meta_train']['context'], extra_data['meta_train']['test']
    test_context_buffer, test_test_buffer = extra_data['meta_test']['context'], extra_data['meta_test']['test']

    # set up the envs
    env_specs = variant['env_specs']
    meta_train_env, meta_test_env = get_meta_env(env_specs)
    meta_train_env.seed(variant['seed'])
    meta_test_env.seed(variant['seed'])

    if variant['scale_env_with_given_demo_stats']:
        assert not env_specs['normalized']
        meta_train_env = ScaledMetaEnv(
            meta_train_env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )
        meta_test_env = ScaledMetaEnv(
            meta_test_env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )

    # set up the policy and training algorithm
    if isinstance(meta_train_env.observation_space, Dict):
        if variant['algo_params']['policy_uses_pixels']:
            raise NotImplementedError('Not implemented pixel version of things!')
        else:
            obs_dim = int(np.prod(meta_train_env.observation_space.spaces['obs'].shape))
    else:
        obs_dim = int(np.prod(meta_train_env.observation_space.shape))
    action_dim = int(np.prod(meta_train_env.action_space.shape))

    print('obs dim: %d' % obs_dim)
    print('act dim: %d' % action_dim)
    sleep(3)

    # make the disc model
    if variant['algo_params']['state_only']:
        print('\n\nUSING STATE ONLY DISC\n\n')
    if variant['algo_params']['state_only']:
        raise NotImplementedError()
    disc_model = AntLinClassDisc(
        obs_dim - 12 + 2 + action_dim,
        num_layer_blocks=variant['disc_num_blocks'],
        hid_dim=variant['disc_hid_dim'],
        hid_act=variant['disc_hid_act'],
        use_bn=variant['disc_use_bn'],
        clamp_magnitude=variant['disc_clamp_magnitude'],
        z_dim=variant['algo_params']['z_dim']
    )
    print(disc_model)
    print(disc_model.clamp_magnitude)

    if variant['algo_params']['use_target_disc']:
        target_disc = disc_model.copy()
    else:
        target_disc = None
    print(disc_model)
    print(disc_model.clamp_magnitude)

    z_dim = variant['algo_params']['z_dim']

    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['num_hidden_layers']
    qf1 = ObsPreprocessedQFunc(
        disc_model.obs_processor,
        z_dim,
        hidden_sizes=hidden_sizes,
        input_size=obs_dim - 12 + 2 + action_dim,
        output_size=1,
        wrap_absorbing=variant['algo_params']['wrap_absorbing']
    )
    qf2 = ObsPreprocessedQFunc(
        disc_model.obs_processor,
        z_dim,
        hidden_sizes=hidden_sizes,
        input_size=obs_dim - 12 + 2 + action_dim,
        output_size=1,
        wrap_absorbing=variant['algo_params']['wrap_absorbing']
    )
    vf = ObsPreprocessedVFunc(
        disc_model.obs_processor,
        z_dim,
        hidden_sizes=hidden_sizes,
        input_size=obs_dim - 12 + 2,
        output_size=1,
        wrap_absorbing=variant['algo_params']['wrap_absorbing']
    )
    policy = WithZObsPreprocessedReparamTanhMultivariateGaussianPolicy(
        disc_model.obs_processor,
        z_dim,
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim - 12 + 2,
        action_dim=action_dim
    )

    policy_optimizer = NewSoftActorCritic(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        wrap_absorbing=variant['algo_params']['wrap_absorbing'],
        **variant['policy_params']
    )

    # make the encoder
    encoder = TimestepBasedEncoder(
        2 * obs_dim if variant['algo_params']['state_only'] else 2 * obs_dim + action_dim,  # (s, a, s')
        variant['algo_params']['r_dim'],
        variant['algo_params']['z_dim'],
        variant['algo_params']['enc_hid_dim'],
        variant['algo_params']['r2z_hid_dim'],
        variant['algo_params']['num_enc_layer_blocks'],
        hid_act='relu',
        use_bn=True,
        within_traj_agg=variant['algo_params']['within_traj_agg'],
        state_only=variant['algo_params']['state_only']
    )
    # ---------------
    # traj_enc = ConvTrajEncoder(
    #     variant['algo_params']['np_params']['traj_enc_params']['num_conv_layers'],
    #     obs_dim + action_dim if not variant['algo_params']['state_only'] else obs_dim,
    #     variant['algo_params']['np_params']['traj_enc_params']['channels'],
    #     variant['algo_params']['np_params']['traj_enc_params']['kernel'],
    #     variant['algo_params']['np_params']['traj_enc_params']['stride'],
    # )
    # Dc2R_map = Dc2RMap(
    #     variant['algo_params']['np_params']['Dc2r_params']['agg_type'],
    #     traj_enc,
    #     state_only=variant['algo_params']['state_only']
    # )
    # r2z_map = R2ZMap(
    #     variant['algo_params']['np_params']['r2z_params']['num_layers'],
    #     variant['algo_params']['np_params']['traj_enc_params']['channels'],
    #     variant['algo_params']['np_params']['r2z_params']['hid_dim'],
    #     variant['algo_params']['z_dim']
    # )
    # encoder = NPEncoder(
    #     Dc2R_map,
    #     r2z_map,
    # )

    train_task_params_sampler, test_task_params_sampler = get_meta_env_params_iters(env_specs)

    algorithm = NeuralProcessAIRL(
        meta_test_env,  # env is the test env, training_env is the training env (following rlkit original setup)
        policy,
        disc_model,
        train_context_buffer,
        train_test_buffer,
        test_context_buffer,
        test_test_buffer,
        encoder,
        policy_optimizer,
        training_env=meta_train_env,  # the env used for generating trajectories
        train_task_params_sampler=train_task_params_sampler,
        test_task_params_sampler=test_task_params_sampler,
        target_disc=target_disc,
        **variant['algo_params']
    )

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
# env_specs = {
#     'base_env_name': 'meta_simple_meta_reacher',
#     'normalized': False
# }
env_specs = {
    'base_env_name': 'meta_simple_meta_reacher',
    'normalized': False,
    'need_pixels': True,
    'render_kwargs': {
        'height': 64,
        'width': 64,
        'camera_id': 0
    }
}
meta_train_env, meta_test_env = get_meta_env(env_specs)
meta_train_params_sampler, meta_test_params_sampler = get_meta_env_params_iters(env_specs)

buffer = d['meta_train']['context']
buffer.policy_uses_pixels = True

task_params, obs_task_params = meta_train_params_sampler.sample()
meta_train_env.reset(task_params=task_params, obs_task_params=obs_task_params)
task_id = meta_train_env.task_identifier

# print(buffer.num_steps_can_sample())
# print(buffer.task_replay_buffers.keys())
def experiment(variant):
    # set up the envs
    env_specs = variant['env_specs']
    meta_train_env, meta_test_env = get_meta_env(env_specs)
    meta_train_env.seed(variant['seed'])
    meta_test_env.seed(variant['seed'])

    # set up the policy and training algorithm
    if isinstance(meta_train_env.observation_space, Dict):
        if variant['algo_params']['policy_uses_pixels']:
            raise NotImplementedError('Not implemented pixel version of things!')
        else:
            obs_dim = int(np.prod(meta_train_env.observation_space.spaces['obs'].shape))
    else:
        obs_dim = int(np.prod(meta_train_env.observation_space.shape))
    action_dim = int(np.prod(meta_train_env.action_space.shape))

    print('obs dim: %d' % obs_dim)
    print('act dim: %d' % action_dim)
    sleep(3)

    z_dim = variant['algo_params']['z_dim']

    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim + z_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim + z_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + z_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim + z_dim,
        action_dim=action_dim,
    )

    # make the encoder
    encoder = TimestepBasedEncoder(
        2 * obs_dim + action_dim,  # (s, a, s')
        variant['algo_params']['r_dim'],
        variant['algo_params']['z_dim'],
        variant['algo_params']['enc_hid_dim'],
        variant['algo_params']['r2z_hid_dim'],
        variant['algo_params']['num_enc_layer_blocks'],
        hid_act='relu',
        use_bn=True,
        within_traj_agg=variant['algo_params']['within_traj_agg'],
        state_only=False
    )

    train_task_params_sampler, test_task_params_sampler = get_meta_env_params_iters(env_specs)

    algorithm = PEARL(
        meta_test_env,  # env is the test env, training_env is the training env (following rlkit original setup)
        policy,
        qf1,
        qf2,
        vf,
        encoder,
        # z_dim,
        training_env=meta_train_env,  # the env used for generating trajectories
        train_task_params_sampler=train_task_params_sampler,
        test_task_params_sampler=test_task_params_sampler,
        **variant['algo_params']
    )

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
def experiment(variant):
    with open(EXPERT_LISTING_YAML_PATH, 'r') as f:
        listings = yaml.load(f.read())
    expert_dir = listings[variant['expert_name']]['exp_dir']
    specific_run = listings[variant['expert_name']]['seed_runs'][variant['expert_seed_run_idx']]
    file_to_load = path.join(expert_dir, specific_run, 'extra_data.pkl')
    extra_data = joblib.load(file_to_load)

    # this script is for the non-meta-learning AIRL
    train_context_buffer, train_test_buffer = extra_data['meta_train']['context'], extra_data['meta_train']['test']
    test_context_buffer, test_test_buffer = extra_data['meta_test']['context'], extra_data['meta_test']['test']

    # set up the envs
    env_specs = variant['env_specs']
    meta_train_env, meta_test_env = get_meta_env(env_specs)
    meta_train_env.seed(variant['seed'])
    meta_test_env.seed(variant['seed'])

    # set up the policy and training algorithm
    if isinstance(meta_train_env.observation_space, Dict):
        if variant['algo_params']['policy_uses_pixels']:
            raise NotImplementedError('Not implemented pixel version of things!')
        else:
            obs_dim = int(np.prod(meta_train_env.observation_space.spaces['obs'].shape))
    else:
        obs_dim = int(np.prod(meta_train_env.observation_space.shape))
    action_dim = int(np.prod(meta_train_env.action_space.shape))

    print('obs dim: %d' % obs_dim)
    print('act dim: %d' % action_dim)
    sleep(3)

    # make the disc model
    if variant['algo_params']['state_only']:
        print('\n\nUSING STATE ONLY DISC\n\n')
    disc_model = StandardMetaDisc(
        2 * obs_dim + action_dim + variant['algo_params']['z_dim']
        if not variant['algo_params']['state_only']
        else 2 * obs_dim + variant['algo_params']['z_dim'],
        num_layer_blocks=variant['disc_num_blocks'],
        hid_dim=variant['disc_hid_dim'],
        hid_act=variant['disc_hid_act'],
        use_bn=variant['disc_use_bn'],
        clamp_magnitude=variant['disc_clamp_magnitude']
    )
    print(disc_model)
    print(disc_model.clamp_magnitude)

    if variant['algo_params']['use_target_disc']:
        target_disc = disc_model.copy()
    else:
        target_disc = None
    print(disc_model)
    print(disc_model.clamp_magnitude)

    z_dim = variant['algo_params']['z_dim']

    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim + z_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim + z_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + z_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim + z_dim,
        action_dim=action_dim,
    )

    policy_optimizer = NewSoftActorCritic(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        wrap_absorbing=variant['algo_params']['wrap_absorbing'],
        **variant['policy_params']
    )

    # make the encoder
    encoder = TimestepBasedEncoder(
        2 * obs_dim + action_dim,  # (s, a, s')
        variant['algo_params']['r_dim'],
        variant['algo_params']['z_dim'],
        variant['algo_params']['enc_hid_dim'],
        variant['algo_params']['r2z_hid_dim'],
        variant['algo_params']['num_enc_layer_blocks'],
        hid_act='relu',
        use_bn=True,
    )

    train_task_params_sampler, test_task_params_sampler = get_meta_env_params_iters(env_specs)

    algorithm = MetaFAIRL(
        meta_test_env,  # env is the test env, training_env is the training env (following rlkit original setup)
        policy,
        disc_model,
        train_context_buffer,
        train_test_buffer,
        test_context_buffer,
        test_test_buffer,
        encoder,
        policy_optimizer,
        training_env=meta_train_env,  # the env used for generating trajectories
        train_task_params_sampler=train_task_params_sampler,
        test_task_params_sampler=test_task_params_sampler,
        target_disc=target_disc,
        **variant['algo_params']
    )

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
def experiment(variant):
    with open(EXPERT_LISTING_YAML_PATH, 'r') as f:
        listings = yaml.load(f.read())
    expert_dir = listings[variant['expert_name']]['exp_dir']
    specific_run = listings[variant['expert_name']]['seed_runs'][variant['expert_seed_run_idx']]
    file_to_load = path.join(expert_dir, specific_run, 'extra_data.pkl')
    extra_data = joblib.load(file_to_load)

    # this script is for the non-meta-learning airl
    train_context_buffer, train_test_buffer = extra_data['meta_train']['context'], extra_data['meta_train']['test']
    test_context_buffer, test_test_buffer = extra_data['meta_test']['context'], extra_data['meta_test']['test']

    # set up the envs
    env_specs = variant['env_specs']
    meta_train_env, meta_test_env = get_meta_env(env_specs)

    # set up the policy and training algorithm
    obs_dim = int(np.prod(meta_train_env.observation_space.spaces['obs'].shape))
    action_dim = int(np.prod(meta_train_env.action_space.shape))

    print('obs dim: %d' % obs_dim)
    print('act dim: %d' % action_dim)
    sleep(3)

    # make the disc model
    z_dim = variant['algo_params']['z_dim']

    # make the MLP
    hidden_sizes = [variant['algo_params']['mlp_hid_dim']] * variant['algo_params']['mlp_layers']
    obs_task_params_dim = int(np.prod(meta_train_env.observation_space.spaces['obs_task_params'].shape))
    mlp = Mlp(
        hidden_sizes,
        output_size=obs_task_params_dim if variant['algo_params']['training_regression'] else 1,
        input_size=z_dim if variant['algo_params']['training_regression'] else z_dim + 2 * obs_task_params_dim,
        batch_norm=variant['algo_params']['mlp_use_bn']
    )

    # Make the encoder
    encoder = TimestepBasedEncoder(
        2 * obs_dim + action_dim,  # (s, a, s')
        variant['algo_params']['r_dim'],
        variant['algo_params']['z_dim'],
        variant['algo_params']['enc_hid_dim'],
        variant['algo_params']['r2z_hid_dim'],
        variant['algo_params']['num_enc_layer_blocks'],
        hid_act='relu',
        use_bn=True,
        within_traj_agg=variant['algo_params']['within_traj_agg']
    )
    # ---------------
    # encoder = WeightShareTimestepBasedEncoder(
    #     obs_dim,
    #     action_dim,
    #     64,
    #     variant['algo_params']['r_dim'],
    #     variant['algo_params']['z_dim'],
    #     variant['algo_params']['enc_hid_dim'],
    #     variant['algo_params']['r2z_hid_dim'],
    #     variant['algo_params']['num_enc_layer_blocks'],
    #     hid_act='relu',
    #     use_bn=True,
    #     within_traj_agg=variant['algo_params']['within_traj_agg']
    # )
    # ---------------
    # traj_enc = ConvTrajEncoder(
    #     variant['algo_params']['np_params']['traj_enc_params']['num_conv_layers'],
    #     # obs_dim + action_dim,
    #     obs_dim + action_dim + obs_dim,
    #     variant['algo_params']['np_params']['traj_enc_params']['channels'],
    #     variant['algo_params']['np_params']['traj_enc_params']['kernel'],
    #     variant['algo_params']['np_params']['traj_enc_params']['stride'],
    # )
    # Dc2R_map = Dc2RMap(
    #     variant['algo_params']['np_params']['Dc2r_params']['agg_type'],
    #     traj_enc,
    #     state_only=False
    # )
    # r2z_map = R2ZMap(
    #     variant['algo_params']['np_params']['r2z_params']['num_layers'],
    #     variant['algo_params']['np_params']['traj_enc_params']['channels'],
    #     variant['algo_params']['np_params']['r2z_params']['hid_dim'],
    #     variant['algo_params']['z_dim']
    # )
    # encoder = NPEncoder(
    #     Dc2R_map,
    #     r2z_map,
    # )

    train_task_params_sampler, test_task_params_sampler = get_meta_env_params_iters(env_specs)

    algorithm = UpperBound(
        meta_train_env,
        train_context_buffer,
        train_test_buffer,
        test_context_buffer,
        test_test_buffer,
        mlp,
        encoder,
        **variant['algo_params']
    )

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1