def experiment(variant):
    """Run single-task SAC on one Walker task drawn from a param sampler.

    The task is selected by ``variant['task_mode']`` (train/test) and
    ``variant['task_idx']``; eval and rollout envs are built from the same
    task params.
    """
    task_mode = variant['task_mode']  # train, test, eval
    task_idx = variant['task_idx']

    if task_mode == 'train':
        sampler = WalkerTrainParamsSampler()
    elif task_mode == 'test':
        sampler = WalkerTestParamsSampler()
    else:
        raise NotImplementedError()

    task_params = sampler.get_task(task_idx)
    obs_task_params = sampler.get_obs_task_params(task_params)

    # separate eval and training env instances built from the same task
    env = SingleTaskWalkerEnv(task_params, obs_task_params)
    training_env = SingleTaskWalkerEnv(task_params, obs_task_params)
    print(env.observation_space)

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    hidden_sizes = [variant['net_size']] * variant['num_hidden_layers']

    print('Using simple model')
    qf1 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    algorithm = NewSoftActorCritic(
        env=env,
        training_env=training_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        **variant['algo_params']
    )

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
def experiment(variant):
    """Train MetaDagger from saved expert meta-demonstration buffers.

    Loads the expert context/test replay buffers and the expert policy,
    builds (optionally demo-stat-scaled) meta train/test envs, constructs the
    policy and timestep-based context encoder, seeds the algorithm's replay
    buffer with the expert context data, then trains.
    """
    with open(EXPERT_LISTING_YAML_PATH, 'r') as f:
        # safe_load: the listing file is plain data; yaml.load without an
        # explicit Loader is deprecated and can construct arbitrary objects.
        listings = yaml.safe_load(f.read())
    expert_dir = listings[variant['expert_name']]['exp_dir']
    specific_run = listings[variant['expert_name']]['seed_runs'][
        variant['expert_seed_run_idx']]
    file_to_load = path.join(expert_dir, specific_run, 'extra_data.pkl')
    extra_data = joblib.load(file_to_load)

    train_context_buffer = extra_data['meta_train']['context']
    train_test_buffer = extra_data['meta_train']['test']
    test_context_buffer = extra_data['meta_test']['context']
    test_test_buffer = extra_data['meta_test']['test']

    # load the expert; drop its replay buffer so we don't hold it in memory
    expert_policy = joblib.load(variant['expert_policy'])['algorithm']
    expert_policy.replay_buffer = None

    # set up the envs
    env_specs = variant['env_specs']
    meta_train_env, meta_test_env = get_meta_env(env_specs)
    meta_train_env.seed(variant['seed'])
    meta_test_env.seed(variant['seed'])

    if variant['scale_env_with_given_demo_stats']:
        assert not env_specs['normalized']
        meta_train_env = ScaledMetaEnv(
            meta_train_env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )
        meta_test_env = ScaledMetaEnv(
            meta_test_env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )
    print(meta_train_env)
    print(meta_test_env)

    # set up the policy and training algorithm
    if isinstance(meta_train_env.observation_space, Dict):
        if variant['algo_params']['policy_uses_pixels']:
            raise NotImplementedError(
                'Not implemented pixel version of things!')
        else:
            obs_dim = int(
                np.prod(meta_train_env.observation_space.spaces['obs'].shape))
    else:
        obs_dim = int(np.prod(meta_train_env.observation_space.shape))
    action_dim = int(np.prod(meta_train_env.action_space.shape))

    print('obs dim: %d' % obs_dim)
    print('act dim: %d' % action_dim)
    sleep(3)

    z_dim = variant['algo_params']['z_dim']
    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['num_hidden_layers']
    # policy conditions on the latent task code z in addition to the obs
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim + z_dim,
        action_dim=action_dim,
    )

    # Make the encoder; input is a full (s, a, s') transition
    encoder = TimestepBasedEncoder(
        2 * obs_dim + action_dim,  # (s, a, s')
        variant['algo_params']['r_dim'],
        variant['algo_params']['z_dim'],
        variant['algo_params']['enc_hid_dim'],
        variant['algo_params']['r2z_hid_dim'],
        variant['algo_params']['num_enc_layer_blocks'],
        hid_act='relu',
        use_bn=True,
        within_traj_agg=variant['algo_params']['within_traj_agg'])

    train_task_params_sampler, test_task_params_sampler = \
        get_meta_env_params_iters(env_specs)

    algorithm = MetaDagger(
        meta_test_env,  # env is the test env, training_env is the training env (following rlkit original setup)
        policy,
        expert_policy,
        train_context_buffer,
        train_test_buffer,
        test_context_buffer,
        test_test_buffer,
        encoder,
        training_env=meta_train_env,  # the env used for generating trajectories
        train_task_params_sampler=train_task_params_sampler,
        test_task_params_sampler=test_task_params_sampler,
        **variant['algo_params'])

    # Pre-fill the algorithm's per-task replay buffers with the expert
    # context transitions (copies the first erb_size entries of every array).
    for task_id in train_context_buffer.task_replay_buffers:
        erb = train_context_buffer.task_replay_buffers[task_id]
        rb = algorithm.replay_buffer.task_replay_buffers[task_id]
        erb_size = erb._size
        print(erb_size)
        for k in erb._observations:
            rb._observations[k][:erb_size] = erb._observations[k][:erb_size]
            rb._next_obs[k][:erb_size] = erb._next_obs[k][:erb_size]
        rb._actions[:erb_size] = erb._actions[:erb_size]
        rb._rewards[:erb_size] = erb._rewards[:erb_size]
        rb._terminals[:erb_size] = erb._terminals[:erb_size]
        rb._absorbing[:erb_size] = erb._absorbing[:erb_size]
        rb._size = erb_size
        rb._top = erb_size

    if ptu.gpu_enabled():
        expert_policy.cuda()
        algorithm.cuda()
    algorithm.train()

    return 1
def experiment(variant):
    """Meta-SAC over a family of envs generated from ranged env_specs.

    List-valued entries in ``variant['env_specs']`` are expanded into the
    cartesian product of concrete env specs; an ``EnvSampler`` draws envs
    from that product during training.
    """
    # split env_specs into ranged (list-valued) and constant entries
    env_specs = variant['env_specs']
    env_specs_vg = VariantGenerator()
    env_spec_constants = {}
    env_spec_ranges = {}
    for key, val in env_specs.items():
        if isinstance(val, list):
            env_specs_vg.add(key, val)
            env_spec_ranges[key] = val
        else:
            env_spec_constants[key] = val

    env_specs_list = []
    for es in env_specs_vg.variants():
        del es['_hidden_keys']
        es.update(env_spec_constants)
        env_specs_list.append(es)

    env_sampler = EnvSampler(env_specs_list)

    # Build the env-params normalizer: each ranged parameter is mapped
    # through its midpoint and half-width (roughly into [-1, 1]).
    mean = []
    half_diff = []
    for key in sorted(env_spec_ranges.keys()):
        r = env_spec_ranges[key]
        if len(r) == 1:
            mean.append(0)
            half_diff.append(r[0])
        else:
            mean.append((r[0] + r[1]) / 2.0)
            half_diff.append((r[1] - r[0]) / 2.0)
    mean = np.array(mean)
    half_diff = np.array(half_diff)

    def env_params_normalizer(params):
        return (params - mean) / half_diff

    variant['algo_params']['env_params_normalizer'] = env_params_normalizer

    # set up similar to non-meta version
    sample_env, _ = env_sampler()
    if variant['algo_params']['concat_env_params_to_obs']:
        meta_params_dim = sample_env.env_meta_params.shape[0]
    else:
        meta_params_dim = 0
    obs_dim = int(np.prod(sample_env.observation_space.shape))
    action_dim = int(np.prod(sample_env.action_space.shape))

    net_size = variant['net_size']
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size],
        input_size=obs_dim + meta_params_dim,
        output_size=1,
    )

    # NOTE(review): this reads the module-level `exp_specs`, not `variant` —
    # presumably set by the launcher script; confirm that is intended.
    if exp_specs['use_new_sac']:
        qf1 = FlattenMlp(
            hidden_sizes=[net_size, net_size],
            input_size=obs_dim + action_dim + meta_params_dim,
            output_size=1,
        )
        qf2 = FlattenMlp(
            hidden_sizes=[net_size, net_size],
            input_size=obs_dim + action_dim + meta_params_dim,
            output_size=1,
        )
        policy = ReparamTanhMultivariateGaussianPolicy(
            hidden_sizes=[net_size, net_size],
            obs_dim=obs_dim + meta_params_dim,
            action_dim=action_dim,
        )
        algorithm = NewMetaSoftActorCritic(
            env_sampler=env_sampler,
            policy=policy,
            qf1=qf1,
            qf2=qf2,
            vf=vf,
            **variant['algo_params'])
    else:
        policy = TanhGaussianPolicy(
            hidden_sizes=[net_size, net_size],
            obs_dim=obs_dim + meta_params_dim,
            action_dim=action_dim,
        )
        qf = FlattenMlp(
            hidden_sizes=[net_size, net_size],
            input_size=obs_dim + action_dim + meta_params_dim,
            output_size=1,
        )
        algorithm = MetaSoftActorCritic(
            env_sampler=env_sampler,
            policy=policy,
            qf=qf,
            vf=vf,
            **variant['algo_params'])

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
def experiment(variant):
    """Behavior cloning against a saved expert demonstration buffer.

    Loads the demo buffer named in ``expert_demos_listing.yaml``, builds the
    eval/training envs (optionally scaled by the demo statistics), and trains
    a BC policy.
    """
    with open('expert_demos_listing.yaml', 'r') as f:
        # safe_load: the listing is plain data; yaml.load without an explicit
        # Loader is deprecated and can construct arbitrary objects.
        listings = yaml.safe_load(f.read())
    expert_demos_path = listings[variant['expert_name']]['file_paths'][
        variant['expert_idx']]
    buffer_save_dict = joblib.load(expert_demos_path)
    expert_replay_buffer = buffer_save_dict['train']

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    if variant['scale_env_with_demo_stats']:
        # normalize obs/acts with the statistics stored with the demos
        env = ScaledEnv(
            env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )
        training_env = ScaledEnv(
            training_env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1

    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    algorithm = BC(
        env=env,
        training_env=training_env,
        exploration_policy=policy,
        expert_replay_buffer=expert_replay_buffer,
        **variant['bc_params'])

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1
def experiment(variant):
    """Train NeuralProcessBC from saved expert meta-demonstration buffers.

    Loads expert context/test replay buffers, builds (optionally
    demo-stat-scaled) meta train/test envs, a latent-conditioned policy, and
    a timestep-based context encoder, then trains.
    """
    with open(EXPERT_LISTING_YAML_PATH, 'r') as f:
        # safe_load: the listing file is plain data; yaml.load without an
        # explicit Loader is deprecated and can construct arbitrary objects.
        listings = yaml.safe_load(f.read())
    expert_dir = listings[variant['expert_name']]['exp_dir']
    specific_run = listings[variant['expert_name']]['seed_runs'][
        variant['expert_seed_run_idx']]
    file_to_load = path.join(expert_dir, specific_run, 'extra_data.pkl')
    extra_data = joblib.load(file_to_load)

    train_context_buffer = extra_data['meta_train']['context']
    train_test_buffer = extra_data['meta_train']['test']
    test_context_buffer = extra_data['meta_test']['context']
    test_test_buffer = extra_data['meta_test']['test']

    # set up the envs
    env_specs = variant['env_specs']
    meta_train_env, meta_test_env = get_meta_env(env_specs)
    meta_train_env.seed(variant['seed'])
    meta_test_env.seed(variant['seed'])

    if variant['scale_env_with_given_demo_stats']:
        assert not env_specs['normalized']
        meta_train_env = ScaledMetaEnv(
            meta_train_env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )
        meta_test_env = ScaledMetaEnv(
            meta_test_env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )
    print(meta_train_env)
    print(meta_test_env)

    # set up the policy and training algorithm
    if isinstance(meta_train_env.observation_space, Dict):
        if variant['algo_params']['policy_uses_pixels']:
            raise NotImplementedError('Not implemented pixel version of things!')
        else:
            obs_dim = int(
                np.prod(meta_train_env.observation_space.spaces['obs'].shape))
    else:
        obs_dim = int(np.prod(meta_train_env.observation_space.shape))
    action_dim = int(np.prod(meta_train_env.action_space.shape))

    print('obs dim: %d' % obs_dim)
    print('act dim: %d' % action_dim)
    sleep(3)

    z_dim = variant['algo_params']['z_dim']
    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['num_hidden_layers']
    # policy conditions on the latent task code z in addition to the obs
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim + z_dim,
        action_dim=action_dim,
    )

    # Make the encoder; input is a full (s, a, s') transition.
    # (Alternative WeightShareTimestepBasedEncoder / NPEncoder variants were
    # removed here as dead commented-out code.)
    encoder = TimestepBasedEncoder(
        2 * obs_dim + action_dim,  # (s, a, s')
        variant['algo_params']['r_dim'],
        variant['algo_params']['z_dim'],
        variant['algo_params']['enc_hid_dim'],
        variant['algo_params']['r2z_hid_dim'],
        variant['algo_params']['num_enc_layer_blocks'],
        hid_act='relu',
        use_bn=True,
        within_traj_agg=variant['algo_params']['within_traj_agg']
    )

    train_task_params_sampler, test_task_params_sampler = \
        get_meta_env_params_iters(env_specs)

    algorithm = NeuralProcessBC(
        meta_test_env,  # env is the test env, training_env is the training env (following rlkit original setup)
        policy,
        train_context_buffer,
        train_test_buffer,
        test_context_buffer,
        test_test_buffer,
        encoder,
        training_env=meta_train_env,  # the env used for generating trajectories
        train_task_params_sampler=train_task_params_sampler,
        test_task_params_sampler=test_task_params_sampler,
        **variant['algo_params']
    )

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
def experiment(variant):
    """Train an energy-based model (DEEN or AE) on expert demonstrations.

    Loads the expert buffer, builds (optionally scaled or min-max-normalized)
    envs, then fits an MLP EBM/AE over (obs, act) pairs — or (obs, obs') when
    ``state_only`` is set.
    """
    with open('expert_demos_listing.yaml', 'r') as f:
        # safe_load: the listing is plain data; yaml.load without an explicit
        # Loader is deprecated and can construct arbitrary objects.
        listings = yaml.safe_load(f.read())
    expert_demos_path = listings[variant['expert_name']]['file_paths'][
        variant['expert_idx']]
    buffer_save_dict = joblib.load(expert_demos_path)
    expert_replay_buffer = buffer_save_dict['train']

    # .get() so the env-wrapping branch below cannot raise KeyError when the
    # flag is absent (the original guarded the first read but indexed the
    # second one unconditionally).
    use_minmax = variant.get('minmax_env_with_demo_stats', False)
    if use_minmax:
        print('Use minmax envs')
        assert 'norm_train' in buffer_save_dict.keys()
        expert_replay_buffer = buffer_save_dict['norm_train']

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    if variant['scale_env_with_demo_stats']:
        env = ScaledEnv(
            env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )
        training_env = ScaledEnv(
            training_env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )
    elif use_minmax:
        env = MinmaxEnv(
            env,
            obs_min=buffer_save_dict['obs_min'],
            obs_max=buffer_save_dict['obs_max'],
        )
        training_env = MinmaxEnv(
            training_env,
            obs_min=buffer_save_dict['obs_min'],
            obs_max=buffer_save_dict['obs_max'],
        )

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1

    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # (s, a) input, or (s, s') when training on states only
    input_dim = obs_dim + action_dim \
        if not variant['ebm_params']['state_only'] else 2 * obs_dim

    mode = variant['ebm_params']['mode']
    if mode == 'deen':
        ebm_model = MLPEBM(
            input_dim,
            num_layer_blocks=variant['ebm_num_blocks'],
            hid_dim=variant['ebm_hid_dim'],
            hid_act=variant['ebm_hid_act'],
            use_bn=variant['ebm_use_bn'],
            clamp_magnitude=variant['ebm_clamp_magnitude']
        )
        algorithm = EBMLearn(
            env=env,
            training_env=training_env,
            ebm=ebm_model,
            input_dim=input_dim,
            exploration_policy=policy,
            sigma=variant['sigma'],
            expert_replay_buffer=expert_replay_buffer,
            **variant['ebm_params']
        )
    elif mode == 'ae':
        ebm_model = MLPAE(
            input_dim,
            num_layer_blocks=variant['ebm_num_blocks'],
            hid_dim=variant['ebm_hid_dim'],
            hid_act=variant['ebm_hid_act'],
            use_bn=variant['ebm_use_bn'],
        )
        algorithm = EBMLearn(
            env=env,
            training_env=training_env,
            ebm=ebm_model,
            input_dim=input_dim,
            exploration_policy=policy,
            sigma=None,
            expert_replay_buffer=expert_replay_buffer,
            **variant['ebm_params']
        )
    else:
        # previously an unknown mode fell through to an UnboundLocalError
        raise NotImplementedError('Unknown EBM mode: {}'.format(mode))

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1
def _load_latest_ebm(ebm_exp_name, load_epoch):
    """Load the EBM checkpoint from the most recently modified run dir.

    Returns (ebm_model, checkpoint_path). ``load_epoch`` is either an epoch
    number (-> ``itr_<n>.pkl``) or the string 'best' (-> ``best.pkl``).
    """
    ebm_dir = os.path.join(config.LOCAL_LOG_DIR, ebm_exp_name)
    ebm_id_dirs = sorted(
        os.listdir(ebm_dir),
        key=lambda x: os.path.getmtime(os.path.join(ebm_dir, x)))
    # Choose the last (newest) as the load ebm dir
    load_ebm_dir = os.path.join(ebm_dir, ebm_id_dirs[-1])
    load_name = 'best.pkl' if load_epoch == 'best' \
        else 'itr_{}.pkl'.format(load_epoch)
    load_ebm_path = os.path.join(load_ebm_dir, load_name)
    load_ebm_pkl = joblib.load(load_ebm_path, mmap_mode='r')
    return load_ebm_pkl['ebm'], load_ebm_path


def experiment(variant):
    """Train EBIL: SAC policy optimization against a pre-trained EBM reward.

    Loads expert demos and a pre-trained EBM checkpoint, optionally sanity
    checks the EBM on expert data, optionally BC-pretrains the policy, then
    runs EBIL.
    """
    with open('expert_demos_listing.yaml', 'r') as f:
        # safe_load: the listing is plain data; yaml.load without an explicit
        # Loader is deprecated and can construct arbitrary objects.
        listings = yaml.safe_load(f.read())
    expert_demos_path = listings[variant['expert_name']]['file_paths'][
        variant['expert_idx']]
    buffer_save_dict = joblib.load(expert_demos_path)
    expert_replay_buffer = buffer_save_dict['train']

    # .get() so the env-wrapping branch below cannot raise KeyError when the
    # flag is absent (the original guarded the first read but indexed the
    # second one unconditionally).
    use_minmax = variant.get('minmax_env_with_demo_stats', False)
    if use_minmax:
        print('Use minmax envs')
        assert 'norm_train' in buffer_save_dict.keys()
        expert_replay_buffer = buffer_save_dict['norm_train']

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    if variant['scale_env_with_demo_stats']:
        env = ScaledEnv(
            env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )
        training_env = ScaledEnv(
            training_env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )
    elif use_minmax:
        env = MinmaxEnv(
            env,
            obs_min=buffer_save_dict['obs_min'],
            obs_max=buffer_save_dict['obs_max'],
        )
        training_env = MinmaxEnv(
            training_env,
            obs_min=buffer_save_dict['obs_min'],
            obs_max=buffer_save_dict['obs_max'],
        )

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1

    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # load the pre-trained energy model (both modes share the loading logic)
    mode = variant['ebil_params']['mode']
    if mode in ('deen', 'ae'):
        ebm_exp_name = 'ebm-' + mode + '-' + variant['env_specs'][
            'env_name'] + '-' + str(
                variant['expert_traj_num']) + '-train--sigma-' + str(
                    variant['ebm_sigma'])
        ebm_model, load_ebm_path = _load_latest_ebm(
            ebm_exp_name, variant['ebm_epoch'])
    else:
        raise NotImplementedError

    print("loaded EBM from {}".format(load_ebm_path))

    # Test: sanity check the EBM on expert vs. perturbed expert data, then exit
    if variant['test']:
        batch_data = expert_replay_buffer.random_batch(
            100, keys=['observations', 'actions'])
        print('ebm_obs: ', np.mean(batch_data['observations'], axis=0))
        obs = torch.Tensor(batch_data['observations'])
        acts = torch.Tensor(batch_data['actions'])
        exp_input = torch.cat([obs, acts], dim=1).to(ptu.device)
        print("Not expert data", ebm_model(exp_input * 200).mean().item())
        print("Expert data", ebm_model(exp_input).mean().item())
        exit(1)

    # set up the algorithm
    trainer = SoftActorCritic(
        policy=policy, qf1=qf1, qf2=qf2, vf=vf, **variant['sac_params'])
    algorithm_pretrain = BC(
        env=env,
        training_env=training_env,
        exploration_policy=policy,
        expert_replay_buffer=expert_replay_buffer,
        **variant['bc_params'])
    algorithm = EBIL(
        env=env,
        training_env=training_env,
        exploration_policy=policy,
        rew_func=variant['rew_func'],
        cons=variant['cons'],
        ebm=ebm_model,
        policy_trainer=trainer,
        expert_replay_buffer=expert_replay_buffer,
        **variant['ebil_params'])

    if ptu.gpu_enabled():
        algorithm_pretrain.to(ptu.device)
        algorithm.to(ptu.device)
    else:
        algorithm_pretrain.to('cpu')
        algorithm.to('cpu')

    if variant['pretrain']:
        algorithm_pretrain.train()
    algorithm.train()

    return 1
def experiment(variant):
    """Plain SAC training on a single flat-observation gym-style env."""
    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    obs_space = env.observation_space
    act_space = env.action_space
    # only flat (non-Dict, 1-D) spaces are supported here
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1

    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    hidden = variant['num_hidden_layers'] * [variant['net_size']]

    qf1 = FlattenMlp(
        hidden_sizes=hidden,
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=hidden,
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=hidden,
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=hidden,
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    trainer = SoftActorCritic(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        **variant['sac_params']
    )
    algorithm = TorchRLAlgorithm(
        trainer=trainer,
        env=env,
        training_env=training_env,
        exploration_policy=policy,
        **variant['rl_alg_params']
    )

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1
def experiment(variant):
    """Behavior cloning from an expert run listed in the expert YAML listing.

    Loads the expert 'train' buffer from the chosen run's extra_data.pkl,
    builds (optionally demo-stat-scaled) envs, and trains a BC policy.
    """
    # get the expert data
    with open(EXPERT_LISTING_YAML_PATH, 'r') as f:
        # safe_load: the listing file is plain data; yaml.load without an
        # explicit Loader is deprecated and can construct arbitrary objects.
        listings = yaml.safe_load(f.read())
    expert_dir = listings[variant['expert_name']]['exp_dir']
    specific_run = listings[variant['expert_name']]['seed_runs'][
        variant['expert_seed_run_idx']]
    file_to_load = path.join(expert_dir, specific_run, 'extra_data.pkl')
    extra_data = joblib.load(file_to_load)
    expert_buffer = extra_data['train']

    # set up the env
    env_specs = variant['env_specs']
    if env_specs['train_test_env']:
        env, training_env = get_env(env_specs)
    else:
        env, _ = get_env(env_specs)
        training_env, _ = get_env(env_specs)

    if variant['scale_env_with_given_demo_stats']:
        assert not env_specs['normalized']
        env = ScaledEnv(
            env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )
        training_env = ScaledEnv(
            training_env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )

    # seed the env
    env.seed(variant['seed'])
    training_env.seed(variant['seed'])
    print(env.observation_space)

    # compute obs_dim and action_dim
    if isinstance(env.observation_space, Dict):
        if not variant['algo_params']['policy_uses_pixels']:
            obs_dim = int(np.prod(env.observation_space.spaces['obs'].shape))
            if variant['algo_params']['policy_uses_task_params']:
                if variant['algo_params']['concat_task_params_to_policy_obs']:
                    obs_dim += int(
                        np.prod(env.observation_space.
                                spaces['obs_task_params'].shape))
                else:
                    raise NotImplementedError()
        else:
            raise NotImplementedError()
    else:
        obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    print(obs_dim, action_dim)
    sleep(3)

    # set up the policy models
    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['policy_num_hidden_layers']
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim,
        action_dim=action_dim,
        batch_norm=variant['policy_uses_bn'],
        layer_norm=variant['policy_uses_layer_norm'])

    # set up the BC algorithm
    algorithm = BC(
        env,
        policy,
        expert_buffer,
        training_env=training_env,
        wrap_absorbing=variant['wrap_absorbing_state'],
        **variant['algo_params'])
    print(algorithm.exploration_policy)
    print(algorithm.eval_policy)

    # train
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
def experiment(variant):
    """SAC / meta-SAC training, with either plain MLP models or the custom
    Ant models that linearly embed the goal position."""
    env_specs = variant['env_specs']
    is_meta = variant['algo_params']['meta']

    if is_meta:
        env, training_env = get_meta_env(env_specs)
    else:
        if env_specs['train_test_env']:
            env, training_env = get_env(env_specs)
        else:
            env, _ = get_env(env_specs)
            training_env, _ = get_env(env_specs)
    if is_meta:
        train_task_params_sampler, test_task_params_sampler = \
            get_meta_env_params_iters(env_specs)

    print(env.observation_space)

    if isinstance(env.observation_space, Dict):
        if not variant['algo_params']['policy_uses_pixels']:
            obs_dim = int(np.prod(env.observation_space.spaces['obs'].shape))
            if variant['algo_params']['policy_uses_task_params']:
                obs_dim += int(np.prod(
                    env.observation_space.spaces['obs_task_params'].shape))
        else:
            raise NotImplementedError()
    else:
        obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    hidden_sizes = [variant['net_size']] * variant['num_hidden_layers']

    if variant['use_custom_ant_models']:
        assert isinstance(env.observation_space, Dict)
        print('CUSTOM ANT WITH LINEAR EMBEDDING OF THE TARGET POSITION')
        task_dim = int(np.prod(
            env.observation_space.spaces['obs_task_params'].shape))
        raw_obs_dim = int(np.prod(env.observation_space.spaces['obs'].shape))
        qf1 = AntRandGoalCustomQFunc(
            task_dim,
            variant['goal_embed_dim'],
            hidden_sizes=hidden_sizes,
            input_size=raw_obs_dim + action_dim,
            output_size=1,
        )
        qf2 = AntRandGoalCustomQFunc(
            task_dim,
            variant['goal_embed_dim'],
            hidden_sizes=hidden_sizes,
            input_size=raw_obs_dim + action_dim,
            output_size=1,
        )
        vf = AntRandGoalCustomVFunc(
            task_dim,
            variant['goal_embed_dim'],
            hidden_sizes=hidden_sizes,
            input_size=raw_obs_dim,
            output_size=1,
        )
        policy = AntRandGoalCustomReparamTanhMultivariateGaussianPolicy(
            task_dim,
            variant['goal_embed_dim'],
            hidden_sizes=hidden_sizes,
            obs_dim=raw_obs_dim,
            action_dim=action_dim,
        )
        # Alternative: custom Ant with gating activations of each layer
        # (AntCustomGatingQFuncV1 / AntCustomGatingVFuncV1 /
        #  AntCustomGatingV1ReparamTanhMultivariateGaussianPolicy)
    else:
        print('Using simple model')
        qf1 = FlattenMlp(
            hidden_sizes=hidden_sizes,
            input_size=obs_dim + action_dim,
            output_size=1,
        )
        qf2 = FlattenMlp(
            hidden_sizes=hidden_sizes,
            input_size=obs_dim + action_dim,
            output_size=1,
        )
        vf = FlattenMlp(
            hidden_sizes=hidden_sizes,
            input_size=obs_dim,
            output_size=1,
        )
        policy = ReparamTanhMultivariateGaussianPolicy(
            hidden_sizes=hidden_sizes,
            obs_dim=obs_dim,
            action_dim=action_dim,
        )

    if is_meta:
        algorithm = MetaNewSoftActorCritic(
            env=env,
            training_env=training_env,
            policy=policy,
            qf1=qf1,
            qf2=qf2,
            vf=vf,
            train_task_params_sampler=train_task_params_sampler,
            test_task_params_sampler=test_task_params_sampler,
            true_env_obs_dim=int(
                np.prod(env.observation_space.spaces['obs'].shape)),
            **variant['algo_params'])
    else:
        algorithm = NewSoftActorCritic(
            env=env,
            training_env=training_env,
            policy=policy,
            qf1=qf1,
            qf2=qf2,
            vf=vf,
            **variant['algo_params'])

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
def experiment(variant):
    """Train an EBM (DEEN or AE) over selected state indices of target demos.

    Loads the rescaled target-state buffer, builds the envs and a policy,
    then fits the energy model on the chosen state dimensions.
    """
    with open('expert_demos_listing.yaml', 'r') as f:
        # safe_load: the listing is plain data; yaml.load without an explicit
        # Loader is deprecated and can construct arbitrary objects.
        listings = yaml.safe_load(f.read())
    demos_path = listings[variant['expert_name']]['file_paths'][
        variant['expert_idx']]
    buffer_save_dict = joblib.load(demos_path)
    target_state_buffer = buffer_save_dict['data']
    target_state_buffer /= variant['rescale']
    state_indices = torch.LongTensor(variant['state_indices'])

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1

    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # the EBM only sees the selected state dimensions
    input_dim = len(state_indices)

    mode = variant['ebm_params']['mode']
    if mode == 'deen':
        ebm_model = MLPEBM(
            input_dim,
            num_layer_blocks=variant['ebm_num_blocks'],
            hid_dim=variant['ebm_hid_dim'],
            hid_act=variant['ebm_hid_act'],
            use_bn=variant['ebm_use_bn'],
            clamp_magnitude=variant['ebm_clamp_magnitude'])
        # NOTE(review): unlike the 'ae' branch this does not pass
        # rescale=variant['rescale'] to EBMLearn — confirm that is intended.
        algorithm = EBMLearn(
            env=env,
            training_env=training_env,
            ebm=ebm_model,
            input_dim=input_dim,
            exploration_policy=policy,
            sigma=variant['sigma'],
            target_state_buffer=target_state_buffer,
            state_indices=state_indices,
            **variant['ebm_params'])
    elif mode == 'ae':
        ebm_model = MLPAE(
            input_dim,
            num_layer_blocks=variant['ebm_num_blocks'],
            hid_dim=variant['ebm_hid_dim'],
            hid_act=variant['ebm_hid_act'],
            use_bn=variant['ebm_use_bn'],
        )
        algorithm = EBMLearn(
            env=env,
            training_env=training_env,
            ebm=ebm_model,
            input_dim=input_dim,
            exploration_policy=policy,
            sigma=None,
            rescale=variant['rescale'],
            target_state_buffer=target_state_buffer,
            state_indices=state_indices,
            **variant['ebm_params'])
    else:
        # previously an unknown mode fell through to an UnboundLocalError
        raise NotImplementedError('Unknown EBM mode: {}'.format(mode))

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1
def experiment(variant):
    """Train SAC on a single (non-meta) environment described by ``variant``.

    Builds twin Q-functions, a value function, and a tanh-Gaussian policy,
    then runs ``NewSoftActorCritic``. Returns 1 on completion.
    """
    env_specs = variant['env_specs']
    env, _ = get_env(env_specs)
    training_env, _ = get_env(env_specs)

    print(env.observation_space)

    obs_space = env.observation_space
    if isinstance(obs_space, Dict):
        # Dict spaces may contain: pixel, obs, obs_task_params.
        if variant['algo_params']['policy_uses_pixels']:
            raise NotImplementedError()
        obs_dim = int(np.prod(obs_space.spaces['obs'].shape))
        if variant['algo_params']['policy_uses_task_params']:
            # only concatenation of task params to the obs is supported
            if not variant['algo_params']['concat_task_params_to_policy_obs']:
                raise NotImplementedError
            obs_dim += int(
                np.prod(obs_space.spaces['obs_task_params'].shape))
    else:
        # OpenAI Gym env or DMCS env exposing a single flat observation
        obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    net_size = variant['net_size']

    def _mlp(in_size):
        # two-hidden-layer critic/value head
        return FlattenMlp(
            hidden_sizes=[net_size, net_size],
            input_size=in_size,
            output_size=1,
        )

    qf1 = _mlp(obs_dim + action_dim)
    qf2 = _mlp(obs_dim + action_dim)
    vf = _mlp(obs_dim)
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    algorithm = NewSoftActorCritic(
        env=env,
        training_env=training_env,
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        **variant['algo_params'])

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
def experiment(variant):
    """Train PEARL on the meta-train env and evaluate on the meta-test env.

    All networks are conditioned on the latent task embedding ``z`` by
    concatenating it to their inputs.
    """
    # set up the envs
    env_specs = variant['env_specs']
    meta_train_env, meta_test_env = get_meta_env(env_specs)
    meta_train_env.seed(variant['seed'])
    meta_test_env.seed(variant['seed'])

    # set up the policy and training algorithm
    if isinstance(meta_train_env.observation_space, Dict):
        if variant['algo_params']['policy_uses_pixels']:
            raise NotImplementedError(
                'Not implemented pixel version of things!')
        else:
            obs_dim = int(
                np.prod(meta_train_env.observation_space.spaces['obs'].shape))
    else:
        obs_dim = int(np.prod(meta_train_env.observation_space.shape))
    action_dim = int(np.prod(meta_train_env.action_space.shape))

    print('obs dim: %d' % obs_dim)
    print('act dim: %d' % action_dim)
    # brief pause so the printed dims can be inspected before training starts
    sleep(3)

    # latent task-embedding dimension
    z_dim = variant['algo_params']['z_dim']

    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['num_hidden_layers']
    # twin Q-functions take (obs, action, z); value function takes (obs, z)
    qf1 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim + z_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim + z_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + z_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim + z_dim,
        action_dim=action_dim,
    )

    # make the encoder; it consumes (s, a, s') transition tuples
    encoder = TimestepBasedEncoder(
        2 * obs_dim + action_dim,  #(s,a,s')
        variant['algo_params']['r_dim'],
        variant['algo_params']['z_dim'],
        variant['algo_params']['enc_hid_dim'],
        variant['algo_params']['r2z_hid_dim'],
        variant['algo_params']['num_enc_layer_blocks'],
        hid_act='relu',
        use_bn=True,
        within_traj_agg=variant['algo_params']['within_traj_agg'],
        state_only=False)

    train_task_params_sampler, test_task_params_sampler = get_meta_env_params_iters(
        env_specs)

    algorithm = PEARL(
        meta_test_env,  # env is the test env, training_env is the training env (following rlkit original setup)
        policy,
        qf1,
        qf2,
        vf,
        encoder,
        # z_dim,
        training_env=meta_train_env,  # the env used for generating trajectories
        train_task_params_sampler=train_task_params_sampler,
        test_task_params_sampler=test_task_params_sampler,
        **variant['algo_params'])

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
def experiment(variant):
    """Train MetaFAIRL: meta adversarial IRL with a timestep-based task encoder.

    Loads expert meta-train/meta-test demonstration buffers, builds the
    discriminator, z-conditioned SAC policy optimizer, and the encoder, then
    runs ``MetaFAIRL``. Returns 1 on completion.
    """
    with open(EXPERT_LISTING_YAML_PATH, 'r') as f:
        # safe_load: yaml.load without an explicit Loader is deprecated and
        # can construct arbitrary Python objects from the file.
        listings = yaml.safe_load(f.read())
    expert_dir = listings[variant['expert_name']]['exp_dir']
    specific_run = listings[variant['expert_name']]['seed_runs'][
        variant['expert_seed_run_idx']]
    file_to_load = path.join(expert_dir, specific_run, 'extra_data.pkl')
    extra_data = joblib.load(file_to_load)

    # expert demonstration buffers for both meta splits
    train_context_buffer, train_test_buffer = extra_data['meta_train'][
        'context'], extra_data['meta_train']['test']
    test_context_buffer, test_test_buffer = extra_data['meta_test'][
        'context'], extra_data['meta_test']['test']

    # set up the envs
    env_specs = variant['env_specs']
    meta_train_env, meta_test_env = get_meta_env(env_specs)
    meta_train_env.seed(variant['seed'])
    meta_test_env.seed(variant['seed'])

    # set up the policy and training algorithm
    if isinstance(meta_train_env.observation_space, Dict):
        if variant['algo_params']['policy_uses_pixels']:
            raise NotImplementedError(
                'Not implemented pixel version of things!')
        else:
            obs_dim = int(
                np.prod(meta_train_env.observation_space.spaces['obs'].shape))
    else:
        obs_dim = int(np.prod(meta_train_env.observation_space.shape))
    action_dim = int(np.prod(meta_train_env.action_space.shape))

    print('obs dim: %d' % obs_dim)
    print('act dim: %d' % action_dim)
    # brief pause so the printed dims can be inspected before training starts
    sleep(3)

    # make the disc model; input is (s, s') [+ action] concatenated with z
    if variant['algo_params']['state_only']:
        print('\n\nUSING STATE ONLY DISC\n\n')
    disc_model = StandardMetaDisc(
        2 * obs_dim + action_dim + variant['algo_params']['z_dim']
        if not variant['algo_params']['state_only'] else 2 * obs_dim +
        variant['algo_params']['z_dim'],
        num_layer_blocks=variant['disc_num_blocks'],
        hid_dim=variant['disc_hid_dim'],
        hid_act=variant['disc_hid_act'],
        use_bn=variant['disc_use_bn'],
        clamp_magnitude=variant['disc_clamp_magnitude'])
    print(disc_model)
    print(disc_model.clamp_magnitude)

    # optional target (slowly-updated copy of the) discriminator
    if variant['algo_params']['use_target_disc']:
        target_disc = disc_model.copy()
    else:
        target_disc = None
    print(disc_model)
    print(disc_model.clamp_magnitude)

    # z-conditioned actor-critic networks
    z_dim = variant['algo_params']['z_dim']
    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim + z_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim + z_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + z_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim + z_dim,
        action_dim=action_dim,
    )

    policy_optimizer = NewSoftActorCritic(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        wrap_absorbing=variant['algo_params']['wrap_absorbing'],
        **variant['policy_params'])

    # make the encoder; it consumes (s, a, s') transition tuples
    encoder = TimestepBasedEncoder(
        2 * obs_dim + action_dim,  #(s,a,s')
        variant['algo_params']['r_dim'],
        variant['algo_params']['z_dim'],
        variant['algo_params']['enc_hid_dim'],
        variant['algo_params']['r2z_hid_dim'],
        variant['algo_params']['num_enc_layer_blocks'],
        hid_act='relu',
        use_bn=True,
    )

    train_task_params_sampler, test_task_params_sampler = get_meta_env_params_iters(
        env_specs)

    algorithm = MetaFAIRL(
        meta_test_env,  # env is the test env, training_env is the training env (following rlkit original setup)
        policy,
        disc_model,
        train_context_buffer,
        train_test_buffer,
        test_context_buffer,
        test_test_buffer,
        encoder,
        policy_optimizer,
        training_env=meta_train_env,  # the env used for generating trajectories
        train_task_params_sampler=train_task_params_sampler,
        test_task_params_sampler=test_task_params_sampler,
        target_disc=target_disc,
        **variant['algo_params'])

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
def experiment(variant):
    """Run EBIL: imitation learning with a pre-trained energy-based reward.

    Loads expert state demonstrations and a pre-trained EBM checkpoint, then
    trains a SAC policy against the EBM-derived reward via ``EBIL``.
    Returns 1 on completion (or exits early in test mode).
    """
    with open('expert_demos_listing.yaml', 'r') as f:
        # safe_load: yaml.load without an explicit Loader is deprecated and
        # can construct arbitrary Python objects from the file.
        listings = yaml.safe_load(f.read())
    demos_path = listings[variant['expert_name']]['file_paths'][
        variant['expert_idx']]
    print(demos_path)

    buffer_save_dict = joblib.load(demos_path)
    target_state_buffer = buffer_save_dict['data']
    # NOTE: unlike the EBM-training script, the buffer is NOT rescaled here;
    # rescaling is handled inside EBIL via the `rescale` argument.
    state_indices = torch.LongTensor(variant['state_indices'])

    # Separate eval and training environments, seeded independently.
    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    obs_space = env.observation_space
    act_space = env.action_space
    # Only flat (1-D) observation/action spaces are supported here.
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1

    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # load the pre-trained energy model (only the 'deen' variant is supported)
    if variant['ebil_params']['mode'] == 'deen':
        # The checkpoint directory name encodes env, #expert trajs, and sigma.
        ebm_exp_name = 'ebm-deen-' + variant['env_specs'][
            'env_name'] + '-' + str(
                variant['expert_traj_num']) + '-train--sigma-' + str(
                    variant['ebm_sigma'])
        ebm_dir = os.path.join(config.LOCAL_LOG_DIR, ebm_exp_name)

        load_ebm_dir = ebm_dir
        load_epoch = variant['ebm_epoch']
        load_name = 'itr_{}.pkl'.format(load_epoch)
        if load_epoch == 'best':
            load_name = 'best.pkl'
        load_ebm_path = os.path.join(load_ebm_dir, load_name)

        load_ebm_pkl = joblib.load(load_ebm_path, mmap_mode='r')
        ebm_model = load_ebm_pkl['ebm']
    else:
        raise NotImplementedError

    # Test mode: sanity-check EBM energies on expert vs. perturbed data.
    if variant['test']:
        batch_data = target_state_buffer / variant['rescale']
        obs = torch.Tensor(batch_data[:1000]).to(ptu.device)
        print("Not expert data", ebm_model(obs * 200).mean().item())
        print("Expert data", ebm_model(obs).mean().item())
        exit(1)

    # set up the algorithm
    trainer = SoftActorCritic(policy=policy,
                              qf1=qf1,
                              qf2=qf2,
                              vf=vf,
                              **variant['sac_params'])
    algorithm = EBIL(env=env,
                     training_env=training_env,
                     exploration_policy=policy,
                     rew_func=variant['rew_func'],
                     cons=variant['cons'],
                     rescale=variant['rescale'],
                     ebm=ebm_model,
                     policy_trainer=trainer,
                     target_state_buffer=target_state_buffer,
                     state_indices=state_indices,
                     **variant['ebil_params'])

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1
def experiment(variant):
    """Run adversarial behavior cloning (AdvBC) against expert demonstrations.

    Loads the expert replay buffer, optionally rescales the envs with the
    demo statistics, builds the policy and AIRL-style discriminator, and
    trains ``AdvBC``. Returns 1 on completion.
    """
    with open(EXPERT_LISTING_YAML_PATH, 'r') as f:
        # safe_load: yaml.load without an explicit Loader is deprecated and
        # can construct arbitrary Python objects from the file.
        listings = yaml.safe_load(f.read())
    expert_dir = listings[variant['expert_name']]['exp_dir']
    specific_run = listings[variant['expert_name']]['seed_runs'][
        variant['expert_seed_run_idx']]
    file_to_load = path.join(expert_dir, specific_run, 'extra_data.pkl')
    extra_data = joblib.load(file_to_load)

    # expert demonstrations (non-meta-learning setting)
    expert_buffer = extra_data['train']

    # set up the env
    env_specs = variant['env_specs']
    if env_specs['train_test_env']:
        env, training_env = get_env(env_specs)
    else:
        env, _ = get_env(env_specs)
        training_env, _ = get_env(env_specs)
    env.seed(variant['seed'])
    training_env.seed(variant['seed'])
    print(env.observation_space)

    if variant['scale_env_with_given_demo_stats']:
        # demo stats were computed on the unnormalized env
        assert not env_specs['normalized']
        env = ScaledEnv(
            env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )
        training_env = ScaledEnv(
            training_env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )

    # compute obs_dim and action_dim
    if isinstance(env.observation_space, Dict):
        if not variant['algo_params']['policy_uses_pixels']:
            obs_dim = int(np.prod(env.observation_space.spaces['obs'].shape))
            if variant['algo_params']['policy_uses_task_params']:
                if variant['algo_params']['concat_task_params_to_policy_obs']:
                    obs_dim += int(
                        np.prod(env.observation_space.
                                spaces['obs_task_params'].shape))
                else:
                    raise NotImplementedError()
        else:
            raise NotImplementedError()
    else:
        obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    print(obs_dim, action_dim)
    # brief pause so the printed dims can be inspected before training starts
    sleep(3)

    # set up the policy models
    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['policy_num_hidden_layers']
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # set up the discriminator models; it scores (obs, action) pairs
    disc_model = StandardAIRLDisc(
        obs_dim + action_dim,
        num_layer_blocks=variant['disc_num_blocks'],
        hid_dim=variant['disc_hid_dim'],
        hid_act=variant['disc_hid_act'],
        use_bn=variant['disc_use_bn'],
        clamp_magnitude=variant['disc_clamp_magnitude'])
    print(disc_model)
    print(disc_model.clamp_magnitude)

    # set up the AdvBC algorithm
    algorithm = AdvBC(env,
                      policy,
                      disc_model,
                      expert_buffer,
                      training_env=training_env,
                      wrap_absorbing=variant['wrap_absorbing_state'],
                      **variant['algo_params'])
    print(algorithm.use_target_disc)
    print(algorithm.soft_target_disc_tau)
    print(algorithm.exploration_policy)
    print(algorithm.eval_policy)
    print(algorithm.disc_optimizer.defaults['lr'])
    print(algorithm.policy_optimizer.defaults['lr'])

    # train
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
def experiment(variant): print('RUNNING') # we have to generate the combinations for the env_specs env_specs = variant['env_specs'] env_sampler = OnTheFlyEnvSampler(env_specs) # Build the normalizer for the env params env_spec_ranges = {} for k, v in env_specs.items(): if isinstance(v, list): env_spec_ranges[k] = v mean = [] half_diff = [] for k in sorted(env_spec_ranges.keys()): r = env_spec_ranges[k] mean.append((r[0] + r[1]) / 2.0) half_diff.append((r[1] - r[0]) / 2.0) mean = np.array(mean) half_diff = np.array(half_diff) def env_params_normalizer(params): return (params - mean) / half_diff variant['algo_params']['env_params_normalizer'] = env_params_normalizer # set up similar to non-meta version sample_env, _ = env_sampler() if variant['algo_params']['concat_env_params_to_obs']: meta_params_dim = sample_env.env_meta_params.shape[0] else: meta_params_dim = 0 obs_dim = int(np.prod(sample_env.observation_space.shape)) action_dim = int(np.prod(sample_env.action_space.shape)) net_size = variant['net_size'] qf1 = FlattenMlp( hidden_sizes=[net_size, net_size], input_size=obs_dim + action_dim + meta_params_dim, output_size=1, ) qf2 = FlattenMlp( hidden_sizes=[net_size, net_size], input_size=obs_dim + action_dim + meta_params_dim, output_size=1, ) vf = FlattenMlp( hidden_sizes=[net_size, net_size], input_size=obs_dim + meta_params_dim, output_size=1, ) policy = ReparamTanhMultivariateGaussianPolicy( hidden_sizes=[net_size, net_size], obs_dim=obs_dim + meta_params_dim, action_dim=action_dim, ) algorithm = NewMetaSoftActorCritic(env_sampler=env_sampler, policy=policy, qf1=qf1, qf2=qf2, vf=vf, **variant['algo_params']) # assert False, "Have not added new sac yet!" if ptu.gpu_enabled(): algorithm.cuda() algorithm.train() return 1
def experiment(variant):
    """Run state-marginal matching against expert x-y position data.

    Builds an (optionally three-way) ResNet discriminator over 2-D positions,
    an entropy-constrained SAC policy optimizer, and trains the matching
    algorithm. Returns 1 on completion.
    """
    # expert data here is just x-y positions
    expert_buffer = joblib.load(variant['xy_data_path'])['xy_data']

    # set up the env
    env_specs = variant['env_specs']
    if env_specs['train_test_env']:
        env, training_env = get_env(env_specs)
    else:
        env, _ = get_env(env_specs)
        training_env, _ = get_env(env_specs)
    env.seed(variant['seed'])
    training_env.seed(variant['seed'])
    print(env.observation_space)

    if variant['scale_env_with_given_demo_stats']:
        # BUG FIX: this branch previously hit a bare `assert False` and then
        # referenced an undefined `extra_data` (a NameError under `python -O`,
        # where asserts are stripped). This script has no demo-stats source,
        # so fail fast with a clear error instead.
        raise NotImplementedError(
            'scale_env_with_given_demo_stats is not supported in this script')

    # compute obs_dim and action_dim
    if isinstance(env.observation_space, Dict):
        if not variant['algo_params']['policy_uses_pixels']:
            obs_dim = int(np.prod(env.observation_space.spaces['obs'].shape))
            if variant['algo_params']['policy_uses_task_params']:
                if variant['algo_params']['concat_task_params_to_policy_obs']:
                    obs_dim += int(
                        np.prod(env.observation_space.
                                spaces['obs_task_params'].shape))
                else:
                    raise NotImplementedError()
        else:
            raise NotImplementedError()
    else:
        obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    print(obs_dim, action_dim)
    # brief pause so the printed dims can be inspected before training starts
    sleep(3)

    # set up the policy models (twin Q-functions plus targets for EntConstSAC)
    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['policy_num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    target_qf1 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    target_qf2 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # set up the discriminator models
    disc_model_class = ThreeWayResNetAIRLDisc if variant[
        'threeway'] else ResNetAIRLDisc
    disc_model = disc_model_class(
        2,  # obs is just x-y pos
        num_layer_blocks=variant['disc_num_blocks'],
        hid_dim=variant['disc_hid_dim'],
        hid_act=variant['disc_hid_act'],
        use_bn=variant['disc_use_bn'],
        clamp_magnitude=variant['disc_clamp_magnitude'])
    print(disc_model)
    print(disc_model.clamp_magnitude)

    # set up the RL algorithm used to train the policy
    policy_optimizer = EntConstSAC(policy=policy,
                                   qf1=qf1,
                                   qf2=qf2,
                                   target_qf1=target_qf1,
                                   target_qf2=target_qf2,
                                   action_dim=action_dim,
                                   **variant['policy_params'])

    # set up the state-marginal matching algorithm
    alg_class = ThreewayStateMarginalMatchingAlg if variant[
        'threeway'] else StateMarginalMatchingAlg
    algorithm = alg_class(env,
                          policy,
                          disc_model,
                          policy_optimizer,
                          expert_buffer,
                          training_env=training_env,
                          **variant['algo_params'])
    print(algorithm.exploration_policy)
    print(algorithm.eval_policy)
    print(algorithm.policy_optimizer.policy_optimizer.defaults['lr'])
    print(algorithm.policy_optimizer.qf1_optimizer.defaults['lr'])
    print(algorithm.policy_optimizer.qf2_optimizer.defaults['lr'])
    print(algorithm.disc_optimizer.defaults['lr'])

    # train
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
def experiment(variant):
    """Run adversarial state-marginal matching (AdvSMM) on expert state data.

    Loads expert state demonstrations, builds a discriminator over the
    selected state coordinates, a SAC policy trainer, and runs ``AdvSMM``.
    Returns 1 on completion.
    """
    with open('expert_demos_listing.yaml', 'r') as f:
        # safe_load: yaml.load without an explicit Loader is deprecated and
        # can construct arbitrary Python objects from the file.
        listings = yaml.safe_load(f.read())
    demos_path = listings[variant['expert_name']]['file_paths'][
        variant['expert_idx']]

    buffer_save_dict = joblib.load(demos_path)
    target_state_buffer = buffer_save_dict['data']
    state_indices = torch.LongTensor(variant['state_indices'])

    # Separate eval and training environments, seeded independently.
    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    obs_space = env.observation_space
    act_space = env.action_space
    # Only flat (1-D) observation/action spaces are supported here.
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1

    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # build the discriminator model; it scores only the selected state dims
    if variant['disc_model_type'] == 'resnet_disc':
        disc_model = ResNetAIRLDisc(
            len(state_indices),
            num_layer_blocks=variant['disc_num_blocks'],
            hid_dim=variant['disc_hid_dim'],
            hid_act=variant['disc_hid_act'],
            use_bn=variant['disc_use_bn'],
            clamp_magnitude=variant['disc_clamp_magnitude'])
    else:
        disc_model = MLPDisc(
            len(state_indices),
            num_layer_blocks=variant['disc_num_blocks'],
            hid_dim=variant['disc_hid_dim'],
            hid_act=variant['disc_hid_act'],
            use_bn=variant['disc_use_bn'],
            clamp_magnitude=variant['disc_clamp_magnitude'])

    # set up the algorithm
    trainer = SoftActorCritic(policy=policy,
                              qf1=qf1,
                              qf2=qf2,
                              vf=vf,
                              **variant['sac_params'])
    algorithm = AdvSMM(env=env,
                       training_env=training_env,
                       exploration_policy=policy,
                       discriminator=disc_model,
                       policy_trainer=trainer,
                       target_state_buffer=target_state_buffer,
                       state_indices=state_indices,
                       **variant['adv_smm_params'])

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1
def experiment(variant):
    """Train NeuralProcessAIRL: meta adversarial IRL with a neural-process
    trajectory encoder.

    Loads expert meta-train/meta-test buffers, builds a z-conditioned SAC
    policy optimizer, a GAIL discriminator, and the neural-process encoder
    (trajectory encoder + r-to-z map + aggregation). Returns 1 on completion.
    """
    with open(EXPERT_LISTING_YAML_PATH, 'r') as f:
        # safe_load: yaml.load without an explicit Loader is deprecated and
        # can construct arbitrary Python objects from the file.
        listings = yaml.safe_load(f.read())
    expert_dir = listings[variant['expert_name']]['exp_dir']
    specific_run = listings[variant['expert_name']]['seed_runs'][
        variant['expert_seed_run_idx']]
    file_to_load = path.join(expert_dir, specific_run, 'extra_data.pkl')
    extra_data = joblib.load(file_to_load)

    # expert demonstration buffers for both meta splits
    train_context_buffer, train_test_buffer = extra_data['meta_train'][
        'context'], extra_data['meta_train']['test']
    test_context_buffer, test_test_buffer = extra_data['meta_test'][
        'context'], extra_data['meta_test']['test']

    # set up the envs
    env_specs = variant['env_specs']
    meta_train_env, meta_test_env = get_meta_env(env_specs)

    # set up the policy and training algorithm
    if isinstance(meta_train_env.observation_space, Dict):
        if variant['algo_params']['policy_uses_pixels']:
            raise NotImplementedError(
                'Not implemented pixel version of things!')
        else:
            obs_dim = int(
                np.prod(meta_train_env.observation_space.spaces['obs'].shape))
    else:
        obs_dim = int(np.prod(meta_train_env.observation_space.shape))
    action_dim = int(np.prod(meta_train_env.action_space.shape))

    print('obs dim: %d' % obs_dim)
    print('act dim: %d' % action_dim)
    # brief pause so the printed dims can be inspected before training starts
    sleep(3)

    # make the policy and policy optimizer; networks condition on z by concat
    hidden_sizes = [variant['algo_params']['policy_net_size']
                    ] * variant['algo_params']['policy_num_layers']
    z_dim = variant['algo_params']['np_params']['z_dim']
    qf1 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + z_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + z_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + z_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim + z_dim,
        action_dim=action_dim,
    )

    disc_model = MlpGAILDisc(
        hidden_sizes=variant['disc_hidden_sizes'],
        output_size=1,
        input_size=obs_dim + action_dim + z_dim,
        hidden_activation=torch.nn.functional.tanh,
        layer_norm=variant['disc_uses_layer_norm']
        # output_activation=identity,
    )

    policy_optimizer = NewSoftActorCritic(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        **variant['algo_params']['policy_params'])

    # Make the neural process encoder.
    # In this initial version all trajectories are assumed to have the same
    # length.
    timestep_enc_params = variant['algo_params']['np_params'][
        'traj_enc_params']['timestep_enc_params']
    traj_enc_params = variant['algo_params']['np_params']['traj_enc_params'][
        'traj_enc_params']
    timestep_enc_params['input_size'] = obs_dim + action_dim
    traj_samples, _ = train_context_buffer.sample_trajs(1, num_tasks=1)
    # TODO(review): the context length is hard-coded to 5 here, overriding the
    # value that could be read from the sampled trajectory
    # (traj_samples[0][0]['observations'].shape[0]). Confirm this is intended.
    len_context_traj = 5
    traj_enc_params['input_size'] = timestep_enc_params[
        'output_size'] * len_context_traj
    traj_enc = TrivialTrajEncoder(timestep_enc_params, traj_enc_params)

    trunk_params = variant['algo_params']['np_params']['r2z_map_params'][
        'trunk_params']
    trunk_params['input_size'] = traj_enc.output_size
    split_params = variant['algo_params']['np_params']['r2z_map_params'][
        'split_heads_params']
    split_params['input_size'] = trunk_params['output_size']
    split_params['output_size'] = variant['algo_params']['np_params']['z_dim']
    r2z_map = TrivialR2ZMap(trunk_params, split_params)

    np_enc = TrivialNPEncoder(
        variant['algo_params']['np_params']['np_enc_params']['agg_type'],
        traj_enc, r2z_map)

    train_task_params_sampler, test_task_params_sampler = get_meta_env_params_iters(
        env_specs)

    algorithm = NeuralProcessAIRL(
        meta_test_env,  # env is the test env, training_env is the training env (following rlkit original setup)
        policy,
        disc_model,
        train_context_buffer,
        train_test_buffer,
        test_context_buffer,
        test_test_buffer,
        np_enc,
        policy_optimizer,
        training_env=meta_train_env,  # the env used for generating trajectories
        train_task_params_sampler=train_task_params_sampler,
        test_task_params_sampler=test_task_params_sampler,
        **variant['algo_params'])

    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
def experiment(variant):
    """Run adversarial IRL (AdvIRL, e.g. GAIL/AIRL-style) on expert demos.

    Loads the expert replay buffer, optionally rescales the envs with the
    demo statistics, builds the SAC policy trainer and the discriminator,
    and trains ``AdvIRL``. Returns 1 on completion.
    """
    with open('expert_demos_listing.yaml', 'r') as f:
        # safe_load: yaml.load without an explicit Loader is deprecated and
        # can construct arbitrary Python objects from the file.
        listings = yaml.safe_load(f.read())
    expert_demos_path = listings[variant['expert_name']]['file_paths'][
        variant['expert_idx']]
    buffer_save_dict = joblib.load(expert_demos_path)
    expert_replay_buffer = buffer_save_dict['train']

    # Separate eval and training environments, seeded independently.
    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    if variant['scale_env_with_demo_stats']:
        # normalize obs/actions with the statistics saved alongside the demos
        env = ScaledEnv(
            env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )
        training_env = ScaledEnv(
            training_env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )

    obs_space = env.observation_space
    act_space = env.action_space
    # Only flat (1-D) observation/action spaces are supported here.
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1

    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # build the discriminator model; state-only discriminators score (s, s')
    disc_model = MLPDisc(
        obs_dim + action_dim
        if not variant['adv_irl_params']['state_only'] else 2 * obs_dim,
        num_layer_blocks=variant['disc_num_blocks'],
        hid_dim=variant['disc_hid_dim'],
        hid_act=variant['disc_hid_act'],
        use_bn=variant['disc_use_bn'],
        clamp_magnitude=variant['disc_clamp_magnitude'])

    # set up the algorithm
    trainer = SoftActorCritic(policy=policy,
                              qf1=qf1,
                              qf2=qf2,
                              vf=vf,
                              **variant['sac_params'])
    algorithm = AdvIRL(env=env,
                       training_env=training_env,
                       exploration_policy=policy,
                       discriminator=disc_model,
                       policy_trainer=trainer,
                       expert_replay_buffer=expert_replay_buffer,
                       **variant['adv_irl_params'])

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1