# Evaluate a saved policy checkpoint on the (optionally demo-scaled) environment
# and report its average return.
def experiment(variant):
    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    if variant['scale_env_with_demo_stats']:
        with open('expert_demos_listing.yaml', 'r') as f:
            listings = yaml.load(f.read())
        expert_demos_path = listings[variant['expert_name']]['file_paths'][
            variant['expert_idx']]
        buffer_save_dict = joblib.load(expert_demos_path)
        env = ScaledEnv(
            env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )

    policy = joblib.load(variant['policy_checkpoint'])['exploration_policy']
    if variant['eval_deterministic']:
        policy = MakeDeterministic(policy)
    policy.to(ptu.device)

    eval_sampler = PathSampler(
        env,
        policy,
        variant['num_eval_steps'],
        variant['max_path_length'],
        no_terminal=variant['no_terminal'],
        render=variant['render'],
        render_kwargs=variant['render_kwargs'])
    test_paths = eval_sampler.obtain_samples()
    average_returns = eval_util.get_average_returns(test_paths)
    print(average_returns)

    return 1
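# Illustrative only (not from the original repo): a minimal `variant` showing the
# keys the evaluation function above reads. All concrete values below are
# placeholders/assumptions; 'norm_halfcheetah_32_demos_sub_20' is reused from the
# expert listing key that appears later in this file set.
example_eval_variant = dict(
    env_specs=dict(
        env_name='halfcheetah',
        env_kwargs={},
        eval_env_seed=3562,
    ),
    scale_env_with_demo_stats=True,
    expert_name='norm_halfcheetah_32_demos_sub_20',
    expert_idx=0,
    policy_checkpoint='/path/to/params.pkl',  # placeholder path
    eval_deterministic=True,
    num_eval_steps=10000,   # placeholder
    max_path_length=1000,   # placeholder
    no_terminal=False,
    render=False,
    render_kwargs={},
)
# experiment(example_eval_variant)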
# EBIL run script: load a pretrained energy model (DEEN or AE), optionally pretrain
# the policy with BC, then train EBIL with a SAC policy trainer.
def experiment(variant):
    with open('expert_demos_listing.yaml', 'r') as f:
        listings = yaml.load(f.read())
    expert_demos_path = listings[variant['expert_name']]['file_paths'][
        variant['expert_idx']]
    buffer_save_dict = joblib.load(expert_demos_path)
    expert_replay_buffer = buffer_save_dict['train']

    if 'minmax_env_with_demo_stats' in variant.keys():
        if variant['minmax_env_with_demo_stats']:
            print('Use minmax envs')
            assert 'norm_train' in buffer_save_dict.keys()
            expert_replay_buffer = buffer_save_dict['norm_train']

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    if variant['scale_env_with_demo_stats']:
        env = ScaledEnv(
            env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )
        training_env = ScaledEnv(
            training_env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )
    elif variant['minmax_env_with_demo_stats']:
        env = MinmaxEnv(
            env,
            obs_min=buffer_save_dict['obs_min'],
            obs_max=buffer_save_dict['obs_max'],
        )
        training_env = MinmaxEnv(
            training_env,
            obs_min=buffer_save_dict['obs_min'],
            obs_max=buffer_save_dict['obs_max'],
        )

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1

    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # build the energy model
    if variant['ebil_params']['mode'] == 'deen':
        """
        ebm_model = MLPEBM(
            obs_dim + action_dim if not variant['ebil_params']['state_only'] else 2*obs_dim,
            num_layer_blocks=variant['ebm_num_blocks'],
            hid_dim=variant['ebm_hid_dim'],
            hid_act=variant['ebm_hid_act'],
            use_bn=variant['ebm_use_bn'],
            clamp_magnitude=variant['ebm_clamp_magnitude'],
        )
        """
        ebm_exp_name = 'ebm-deen-' + variant['env_specs'][
            'env_name'] + '-' + str(
                variant['expert_traj_num']) + '-train--sigma-' + str(
                    variant['ebm_sigma'])
        ebm_dir = os.path.join(config.LOCAL_LOG_DIR, ebm_exp_name)

        ebm_id_dirs = os.listdir(ebm_dir)
        ebm_id_dirs = sorted(
            ebm_id_dirs,
            key=lambda x: os.path.getmtime(os.path.join(ebm_dir, x)))
        load_ebm_dir = os.path.join(
            ebm_dir, ebm_id_dirs[-1])  # Choose the last as the load ebm dir

        load_epoch = variant['ebm_epoch']
        load_name = 'itr_{}.pkl'.format(load_epoch)
        if load_epoch == 'best':
            load_name = 'best.pkl'
        load_ebm_path = os.path.join(load_ebm_dir, load_name)

        load_ebm_pkl = joblib.load(load_ebm_path, mmap_mode='r')
        ebm_model = load_ebm_pkl['ebm']

    elif variant['ebil_params']['mode'] == 'ae':
        ebm_exp_name = 'ebm-ae-' + variant['env_specs']['env_name'] + '-' + str(
            variant['expert_traj_num']) + '-train--sigma-' + str(
                variant['ebm_sigma'])
        ebm_dir = os.path.join(config.LOCAL_LOG_DIR, ebm_exp_name)

        ebm_id_dirs = os.listdir(ebm_dir)
        ebm_id_dirs = sorted(
            ebm_id_dirs,
            key=lambda x: os.path.getmtime(os.path.join(ebm_dir, x)))
        load_ebm_dir = os.path.join(
            ebm_dir, ebm_id_dirs[-1])  # Choose the last as the load ebm dir

        load_epoch = variant['ebm_epoch']
        load_name = 'itr_{}.pkl'.format(load_epoch)
        if load_epoch == 'best':
            load_name = 'best.pkl'
        load_ebm_path = os.path.join(load_ebm_dir, load_name)

        load_ebm_pkl = joblib.load(load_ebm_path, mmap_mode='r')
        ebm_model = load_ebm_pkl['ebm']

    else:
        raise NotImplementedError

    print("loaded EBM from {}".format(load_ebm_path))

    # Test
    if variant['test']:
        batch_data = expert_replay_buffer.random_batch(
            100, keys=['observations', 'actions'])
        print('ebm_obs: ', np.mean(batch_data['observations'], axis=0))
        obs = torch.Tensor(batch_data['observations'])
        acts = torch.Tensor(batch_data['actions'])
        exp_input = torch.cat([obs, acts], dim=1).to(ptu.device)
        print("Not expert data", ebm_model(exp_input * 200).mean().item())
        print("Expert data", ebm_model(exp_input).mean().item())
        exit(1)

    # set up the algorithm
    trainer = SoftActorCritic(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        **variant['sac_params'])
    algorithm_pretrain = BC(
        env=env,
        training_env=training_env,
        exploration_policy=policy,
        expert_replay_buffer=expert_replay_buffer,
        **variant['bc_params'])
    algorithm = EBIL(
        env=env,
        training_env=training_env,
        exploration_policy=policy,
        rew_func=variant['rew_func'],
        cons=variant['cons'],
        ebm=ebm_model,
        policy_trainer=trainer,
        expert_replay_buffer=expert_replay_buffer,
        **variant['ebil_params'])

    if ptu.gpu_enabled():
        algorithm_pretrain.to(ptu.device)
        algorithm.to(ptu.device)
    else:
        algorithm_pretrain.to('cpu')
        algorithm.to('cpu')

    if variant['pretrain']:
        algorithm_pretrain.train()
    algorithm.train()

    return 1
def experiment(variant):
    expert_buffer = joblib.load(variant['xy_data_path'])['xy_data']

    # set up the env
    env_specs = variant['env_specs']
    if env_specs['train_test_env']:
        env, training_env = get_env(env_specs)
    else:
        env, _ = get_env(env_specs)
        training_env, _ = get_env(env_specs)
    env.seed(variant['seed'])
    training_env.seed(variant['seed'])
    print(env.observation_space)

    if variant['scale_env_with_given_demo_stats']:
        assert False
        assert not env_specs['normalized']
        env = ScaledEnv(
            env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )
        training_env = ScaledEnv(
            training_env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )

    # compute obs_dim and action_dim
    if isinstance(env.observation_space, Dict):
        if not variant['algo_params']['policy_uses_pixels']:
            obs_dim = int(np.prod(env.observation_space.spaces['obs'].shape))
            if variant['algo_params']['policy_uses_task_params']:
                if variant['algo_params']['concat_task_params_to_policy_obs']:
                    obs_dim += int(
                        np.prod(env.observation_space.
                                spaces['obs_task_params'].shape))
                else:
                    raise NotImplementedError()
        else:
            raise NotImplementedError()
    else:
        obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    print(obs_dim, action_dim)
    sleep(3)

    # set up the policy models
    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['policy_num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    target_qf1 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    target_qf2 = FlattenMlp(
        hidden_sizes=hidden_sizes,
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        # policy = ReparamMultivariateGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim,
        action_dim=action_dim,
        # std=0.1
    )

    # set up the discriminator models
    disc_model_class = ThreeWayResNetAIRLDisc if variant[
        'threeway'] else ResNetAIRLDisc
    disc_model = disc_model_class(
        2,  # obs is just x-y pos
        num_layer_blocks=variant['disc_num_blocks'],
        hid_dim=variant['disc_hid_dim'],
        hid_act=variant['disc_hid_act'],
        use_bn=variant['disc_use_bn'],
        clamp_magnitude=variant['disc_clamp_magnitude'])
    print(disc_model)
    print(disc_model.clamp_magnitude)

    # set up the RL algorithm used to train the policy
    policy_optimizer = EntConstSAC(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,
        action_dim=action_dim,
        **variant['policy_params'])

    # set up the AIRL algorithm
    alg_class = ThreewayStateMarginalMatchingAlg if variant[
        'threeway'] else StateMarginalMatchingAlg
    algorithm = alg_class(
        env,
        policy,
        disc_model,
        policy_optimizer,
        expert_buffer,
        training_env=training_env,
        **variant['algo_params'])
    print(algorithm.exploration_policy)
    print(algorithm.eval_policy)
    print(algorithm.policy_optimizer.policy_optimizer.defaults['lr'])
    print(algorithm.policy_optimizer.qf1_optimizer.defaults['lr'])
    print(algorithm.policy_optimizer.qf2_optimizer.defaults['lr'])
    print(algorithm.disc_optimizer.defaults['lr'])

    # train
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
def experiment(variant):
    with open(EXPERT_LISTING_YAML_PATH, 'r') as f:
        listings = yaml.load(f.read())
    expert_dir = listings[variant['expert_name']]['exp_dir']
    specific_run = listings[variant['expert_name']]['seed_runs'][
        variant['expert_seed_run_idx']]
    file_to_load = path.join(expert_dir, specific_run, 'extra_data.pkl')
    extra_data = joblib.load(file_to_load)

    # this script is for the non-meta-learning GAIL
    expert_buffer = extra_data['train']

    # set up the env
    env_specs = variant['env_specs']
    if env_specs['train_test_env']:
        env, training_env = get_env(env_specs)
    else:
        env, _ = get_env(env_specs)
        training_env, _ = get_env(env_specs)
    env.seed(variant['seed'])
    training_env.seed(variant['seed'])
    print(env.observation_space)

    if variant['scale_env_with_given_demo_stats']:
        assert not env_specs['normalized']
        env = ScaledEnv(
            env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )
        training_env = ScaledEnv(
            training_env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )

    # compute obs_dim and action_dim
    if isinstance(env.observation_space, Dict):
        if not variant['algo_params']['policy_uses_pixels']:
            obs_dim = int(np.prod(env.observation_space.spaces['obs'].shape))
            if variant['algo_params']['policy_uses_task_params']:
                if variant['algo_params']['concat_task_params_to_policy_obs']:
                    obs_dim += int(
                        np.prod(env.observation_space.
                                spaces['obs_task_params'].shape))
                else:
                    raise NotImplementedError()
        else:
            raise NotImplementedError()
    else:
        obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    print(obs_dim, action_dim)
    sleep(3)

    # set up the policy models
    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['policy_num_hidden_layers']
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # set up the discriminator models
    disc_model = StandardAIRLDisc(
        obs_dim + action_dim,
        num_layer_blocks=variant['disc_num_blocks'],
        hid_dim=variant['disc_hid_dim'],
        hid_act=variant['disc_hid_act'],
        use_bn=variant['disc_use_bn'],
        clamp_magnitude=variant['disc_clamp_magnitude'])
    print(disc_model)
    print(disc_model.clamp_magnitude)

    # set up the AdvBC algorithm
    algorithm = AdvBC(
        env,
        policy,
        disc_model,
        expert_buffer,
        training_env=training_env,
        wrap_absorbing=variant['wrap_absorbing_state'],
        **variant['algo_params'])
    print(algorithm.use_target_disc)
    print(algorithm.soft_target_disc_tau)
    print(algorithm.exploration_policy)
    print(algorithm.eval_policy)
    print(algorithm.disc_optimizer.defaults['lr'])
    print(algorithm.policy_optimizer.defaults['lr'])

    # train
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
# Behaviour cloning (BC) run script.
def experiment(variant):
    with open('expert_demos_listing.yaml', 'r') as f:
        listings = yaml.load(f.read())
    expert_demos_path = listings[variant['expert_name']]['file_paths'][
        variant['expert_idx']]
    buffer_save_dict = joblib.load(expert_demos_path)
    expert_replay_buffer = buffer_save_dict['train']

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    if variant['scale_env_with_demo_stats']:
        env = ScaledEnv(
            env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )
        training_env = ScaledEnv(
            training_env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1

    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    algorithm = BC(
        env=env,
        training_env=training_env,
        exploration_policy=policy,
        expert_replay_buffer=expert_replay_buffer,
        **variant['bc_params'])

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1
# BC run script with an additional MLPDisc critic model
# (configured via variant['adp_bc_params']).
def experiment(variant):
    with open('expert_demos_listing.yaml', 'r') as f:
        listings = yaml.load(f.read())
    expert_demos_path = listings[variant['expert_name']]['file_paths'][
        variant['expert_idx']]
    buffer_save_dict = joblib.load(expert_demos_path)
    expert_replay_buffer = buffer_save_dict['train']

    if 'minmax_env_with_demo_stats' in variant.keys():
        if variant['minmax_env_with_demo_stats']:
            assert 'norm_train' in buffer_save_dict.keys()
            expert_replay_buffer = buffer_save_dict['norm_train']

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    if variant['scale_env_with_demo_stats']:
        env = ScaledEnv(
            env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )
        training_env = ScaledEnv(
            training_env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )
    elif variant['minmax_env_with_demo_stats']:
        env = MinmaxEnv(
            env,
            obs_min=buffer_save_dict['obs_min'],
            obs_max=buffer_save_dict['obs_max'],
        )
        training_env = MinmaxEnv(
            training_env,
            obs_min=buffer_save_dict['obs_min'],
            obs_max=buffer_save_dict['obs_max'],
        )

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1

    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # build the critic model
    critic_model = MLPDisc(
        variant['policy_net_size'],
        num_layer_blocks=variant['critic_num_blocks'],
        hid_dim=variant['critic_hid_dim'],
        hid_act=variant['critic_hid_act'],
        use_bn=variant['critic_use_bn'])

    algorithm = BC(
        env=env,
        training_env=training_env,
        exploration_policy=policy,
        critic=critic_model,
        expert_replay_buffer=expert_replay_buffer,
        **variant['adp_bc_params'])

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1
from rlkit.samplers import PathSampler
from rlkit.torch.sac.policies import MakeDeterministic
from rlkit.envs.wrappers import ScaledEnv

env_specs = {'env_name': 'halfcheetah', 'env_kwargs': {}, 'eval_env_seed': 3562}
env = get_env(env_specs)
env.seed(env_specs['eval_env_seed'])

with open('expert_demos_listing.yaml', 'r') as f:
    listings = yaml.load(f.read())
expert_demos_path = listings['norm_halfcheetah_32_demos_sub_20']['file_paths'][0]
buffer_save_dict = joblib.load(expert_demos_path)

env = ScaledEnv(
    env,
    obs_mean=buffer_save_dict['obs_mean'],
    obs_std=buffer_save_dict['obs_std'],
    acts_mean=buffer_save_dict['acts_mean'],
    acts_std=buffer_save_dict['acts_std'],
)

bc_policy = joblib.load('/scratch/hdd001/home/kamyar/output/paper-version-hc-bc/paper_version_hc_bc_2019_05_19_00_32_05_0000--s-0/params.pkl')['exploration_policy']
bc_policy = MakeDeterministic(bc_policy)
bc_policy.to(ptu.device)

dagger_policy = joblib.load('/scratch/hdd001/home/kamyar/output/dagger-halfcheetah/dagger_halfcheetah_2019_08_20_16_30_36_0000--s-0/params.pkl')['exploration_policy']
dagger_policy = MakeDeterministic(dagger_policy)
dagger_policy.to(ptu.device)

irl_policy = joblib.load('/scratch/hdd001/home/kamyar/output/hc_airl_ckpt/params.pkl')['exploration_policy']
irl_policy = MakeDeterministic(irl_policy)
irl_policy.to(ptu.device)
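# Sketch (not part of the original snippet): roll out the three loaded policies with
# the same PathSampler / eval_util pattern used in the evaluation function above and
# compare their average returns. Assumes eval_util is imported as elsewhere in this
# code; the step counts are placeholders.
for name, loaded_policy in [('bc', bc_policy), ('dagger', dagger_policy), ('irl', irl_policy)]:
    sampler = PathSampler(
        env,
        loaded_policy,
        10000,  # num_eval_steps (placeholder)
        1000,   # max_path_length (placeholder)
        no_terminal=False,
        render=False,
        render_kwargs={},
    )
    paths = sampler.obtain_samples()
    print('{} average return: {}'.format(name, eval_util.get_average_returns(paths)))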
# BC run script whose policy can optionally use batch norm / layer norm.
def experiment(variant):
    # get the expert data
    with open(EXPERT_LISTING_YAML_PATH, 'r') as f:
        listings = yaml.load(f.read())
    expert_dir = listings[variant['expert_name']]['exp_dir']
    specific_run = listings[variant['expert_name']]['seed_runs'][
        variant['expert_seed_run_idx']]
    file_to_load = path.join(expert_dir, specific_run, 'extra_data.pkl')
    extra_data = joblib.load(file_to_load)
    expert_buffer = extra_data['train']

    # set up the env
    env_specs = variant['env_specs']
    if env_specs['train_test_env']:
        env, training_env = get_env(env_specs)
    else:
        env, _ = get_env(env_specs)
        training_env, _ = get_env(env_specs)

    if variant['scale_env_with_given_demo_stats']:
        assert not env_specs['normalized']
        env = ScaledEnv(
            env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )
        training_env = ScaledEnv(
            training_env,
            obs_mean=extra_data['obs_mean'],
            obs_std=extra_data['obs_std'],
            acts_mean=extra_data['acts_mean'],
            acts_std=extra_data['acts_std'],
        )

    # seed the env
    env.seed(variant['seed'])
    training_env.seed(variant['seed'])
    print(env.observation_space)

    # compute obs_dim and action_dim
    if isinstance(env.observation_space, Dict):
        if not variant['algo_params']['policy_uses_pixels']:
            obs_dim = int(np.prod(env.observation_space.spaces['obs'].shape))
            if variant['algo_params']['policy_uses_task_params']:
                if variant['algo_params']['concat_task_params_to_policy_obs']:
                    obs_dim += int(
                        np.prod(env.observation_space.
                                spaces['obs_task_params'].shape))
                else:
                    raise NotImplementedError()
        else:
            raise NotImplementedError()
    else:
        obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    print(obs_dim, action_dim)
    sleep(3)

    # set up the policy models
    policy_net_size = variant['policy_net_size']
    hidden_sizes = [policy_net_size] * variant['policy_num_hidden_layers']
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=hidden_sizes,
        obs_dim=obs_dim,
        action_dim=action_dim,
        batch_norm=variant['policy_uses_bn'],
        layer_norm=variant['policy_uses_layer_norm'])
    # policy = MlpPolicy(
    #     hidden_sizes=hidden_sizes,
    #     obs_dim=obs_dim,
    #     action_dim=action_dim,
    #     batch_norm=variant['policy_uses_bn'],
    #     layer_norm=variant['policy_uses_layer_norm']
    # )

    # set up the BC algorithm
    algorithm = BC(
        env,
        policy,
        expert_buffer,
        training_env=training_env,
        wrap_absorbing=variant['wrap_absorbing_state'],
        **variant['algo_params'])
    print(algorithm.exploration_policy)
    print(algorithm.eval_policy)

    # train
    if ptu.gpu_enabled():
        algorithm.cuda()
    algorithm.train()

    return 1
# Evaluate every run in a hyperparameter-search output directory on the demo-scaled Ant env.
EVAL_DETERMINISTIC = False
N_ROLLOUTS = 10
print('EVAL_DETERMINISTIC: {}\n'.format(EVAL_DETERMINISTIC))

# exp_path = '/scratch/hdd001/home/kamyar/output/super-hype-search-fairl-ant-4-demos'
exp_path = '/scratch/hdd001/home/kamyar/output/super-hype-search-airl-ant-32-demos/'
print(exp_path)

env = AntEnv()
extra_data = joblib.load(
    '/scratch/hdd001/home/kamyar/expert_demos/norm_ant_32_demos_20_subsampling/extra_data.pkl'
)
env = ScaledEnv(
    env,
    obs_mean=extra_data['obs_mean'],
    obs_std=extra_data['obs_std'],
    acts_mean=extra_data['acts_mean'],
    acts_std=extra_data['acts_std'],
)

all_returns = defaultdict(list)
last_time = time.time()
for sub_exp in os.listdir(exp_path):
    try:
        policy = joblib.load(osp.join(exp_path, sub_exp, 'params.pkl'))['policy']
        with open(osp.join(exp_path, sub_exp, 'variant.json'), 'r') as f:
            sub_exp_specs = json.load(f)
    except:
        continue
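    # Sketch of a possible continuation (the loop body is truncated in this excerpt):
    # evaluate each run's policy with the PathSampler / eval_util pattern used in the
    # evaluation function above and record returns per sub-experiment. Assumes
    # MakeDeterministic, PathSampler, eval_util, and ptu are imported; the step and
    # path-length numbers are placeholders.
    if EVAL_DETERMINISTIC:
        policy = MakeDeterministic(policy)
    policy.to(ptu.device)
    sampler = PathSampler(
        env,
        policy,
        N_ROLLOUTS * 1000,  # total eval steps (placeholder)
        1000,               # max_path_length (placeholder)
        no_terminal=False,
        render=False,
        render_kwargs={},
    )
    paths = sampler.obtain_samples()
    all_returns[sub_exp].append(eval_util.get_average_returns(paths))
    print('{} evaluated in {:.1f}s'.format(sub_exp, time.time() - last_time))
    last_time = time.time()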
def experiment(variant):
    with open('expert_demos_listing.yaml', 'r') as f:
        listings = yaml.load(f.read())
    expert_demos_path = listings[variant['expert_name']]['file_paths'][
        variant['expert_idx']]
    buffer_save_dict = joblib.load(expert_demos_path)
    expert_replay_buffer = buffer_save_dict['train']

    if 'minmax_env_with_demo_stats' in variant.keys():
        if variant['minmax_env_with_demo_stats']:
            print('Use minmax envs')
            assert 'norm_train' in buffer_save_dict.keys()
            expert_replay_buffer = buffer_save_dict['norm_train']

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    if variant['scale_env_with_demo_stats']:
        env = ScaledEnv(
            env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )
        training_env = ScaledEnv(
            training_env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )
    elif variant['minmax_env_with_demo_stats']:
        env = MinmaxEnv(
            env,
            obs_min=buffer_save_dict['obs_min'],
            obs_max=buffer_save_dict['obs_max'],
        )
        training_env = MinmaxEnv(
            training_env,
            obs_min=buffer_save_dict['obs_min'],
            obs_max=buffer_save_dict['obs_max'],
        )

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1

    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    input_dim = obs_dim + action_dim if not variant['ebm_params'][
        'state_only'] else 2 * obs_dim

    # build the energy model
    if variant['ebm_params']['mode'] == 'deen':
        ebm_model = MLPEBM(
            input_dim,
            num_layer_blocks=variant['ebm_num_blocks'],
            hid_dim=variant['ebm_hid_dim'],
            hid_act=variant['ebm_hid_act'],
            use_bn=variant['ebm_use_bn'],
            clamp_magnitude=variant['ebm_clamp_magnitude'],
        )
        algorithm = EBMLearn(
            env=env,
            training_env=training_env,
            ebm=ebm_model,
            input_dim=input_dim,
            exploration_policy=policy,
            sigma=variant['sigma'],
            expert_replay_buffer=expert_replay_buffer,
            **variant['ebm_params'])

    # build the energy model
    elif variant['ebm_params']['mode'] == 'ae':
        ebm_model = MLPAE(
            input_dim,
            num_layer_blocks=variant['ebm_num_blocks'],
            hid_dim=variant['ebm_hid_dim'],
            hid_act=variant['ebm_hid_act'],
            use_bn=variant['ebm_use_bn'],
        )
        algorithm = EBMLearn(
            env=env,
            training_env=training_env,
            ebm=ebm_model,
            input_dim=input_dim,
            exploration_policy=policy,
            sigma=None,
            expert_replay_buffer=expert_replay_buffer,
            **variant['ebm_params'])

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1
def experiment(variant):
    with open('expert_demos_listing.yaml', 'r') as f:
        listings = yaml.load(f.read())
    expert_demos_path = listings[variant['expert_name']]['file_paths'][
        variant['expert_idx']]
    buffer_save_dict = joblib.load(expert_demos_path)
    expert_replay_buffer = buffer_save_dict['train']

    env_specs = variant['env_specs']
    env = get_env(env_specs)
    env.seed(env_specs['eval_env_seed'])
    training_env = get_env(env_specs)
    training_env.seed(env_specs['training_env_seed'])

    print('\n\nEnv: {}'.format(env_specs['env_name']))
    print('kwargs: {}'.format(env_specs['env_kwargs']))
    print('Obs Space: {}'.format(env.observation_space))
    print('Act Space: {}\n\n'.format(env.action_space))

    if variant['scale_env_with_demo_stats']:
        env = ScaledEnv(
            env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )
        training_env = ScaledEnv(
            training_env,
            obs_mean=buffer_save_dict['obs_mean'],
            obs_std=buffer_save_dict['obs_std'],
            acts_mean=buffer_save_dict['acts_mean'],
            acts_std=buffer_save_dict['acts_std'],
        )

    obs_space = env.observation_space
    act_space = env.action_space
    assert not isinstance(obs_space, Dict)
    assert len(obs_space.shape) == 1
    assert len(act_space.shape) == 1

    obs_dim = obs_space.shape[0]
    action_dim = act_space.shape[0]

    # build the policy models
    net_size = variant['policy_net_size']
    num_hidden = variant['policy_num_hidden_layers']
    qf1 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=num_hidden * [net_size],
        input_size=obs_dim,
        output_size=1,
    )
    policy = ReparamTanhMultivariateGaussianPolicy(
        hidden_sizes=num_hidden * [net_size],
        obs_dim=obs_dim,
        action_dim=action_dim,
    )

    # build the discriminator model
    disc_model = MLPDisc(
        obs_dim + action_dim
        if not variant['adv_irl_params']['state_only'] else 2 * obs_dim,
        num_layer_blocks=variant['disc_num_blocks'],
        hid_dim=variant['disc_hid_dim'],
        hid_act=variant['disc_hid_act'],
        use_bn=variant['disc_use_bn'],
        clamp_magnitude=variant['disc_clamp_magnitude'])

    # set up the algorithm
    trainer = SoftActorCritic(
        policy=policy,
        qf1=qf1,
        qf2=qf2,
        vf=vf,
        **variant['sac_params'])
    algorithm = AdvIRL(
        env=env,
        training_env=training_env,
        exploration_policy=policy,
        discriminator=disc_model,
        policy_trainer=trainer,
        expert_replay_buffer=expert_replay_buffer,
        **variant['adv_irl_params'])

    if ptu.gpu_enabled():
        algorithm.to(ptu.device)
    algorithm.train()

    return 1
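# Illustrative only (not from the original repo): the nested `variant` structure the
# AdvIRL script above expects. All concrete values are placeholders/assumptions;
# 'sac_params' and 'adv_irl_params' are forwarded verbatim to SoftActorCritic and
# AdvIRL, so their exact contents depend on those constructors.
example_adv_irl_variant = dict(
    expert_name='norm_halfcheetah_32_demos_sub_20',  # key into expert_demos_listing.yaml
    expert_idx=0,
    env_specs=dict(
        env_name='halfcheetah',
        env_kwargs={},
        eval_env_seed=3562,      # placeholder seeds
        training_env_seed=7,
    ),
    scale_env_with_demo_stats=True,
    policy_net_size=256,         # placeholder network sizes
    policy_num_hidden_layers=2,
    disc_num_blocks=2,
    disc_hid_dim=128,
    disc_hid_act='tanh',
    disc_use_bn=False,
    disc_clamp_magnitude=10.0,
    sac_params={},                           # kwargs for SoftActorCritic (placeholder)
    adv_irl_params=dict(state_only=False),   # kwargs for AdvIRL; 'state_only' is read above
)
# experiment(example_adv_irl_variant)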