def run(reward_path, alpha=0.01, prior_reward_weight=0.1):
    seed = 0
    system.reproduce(seed)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    env_kwargs = {
        'T': 30,
        'state_indices': [0, 1],
        'size_x': 6,
        'size_y': 6,
        'prior_reward_weight': prior_reward_weight,
    }

    # Load a learned reward (MLP reward or AIRL discriminator) if a checkpoint is given.
    reward_func = None
    if reward_path is not None:
        dir_path = osp.dirname(osp.dirname(reward_path))
        print("dir path is: ", dir_path)
        v_path = osp.join(dir_path, 'variant.json')
        v = json.load(open(v_path, 'r'))
        print(v)

        if use_reward(v['obj']):
            from firl.models.reward import MLPReward
            reward_kwargs = v['reward']
            reward_func = MLPReward(len(env_kwargs['state_indices']), **reward_kwargs, device=device)
            reward_func.load_state_dict(torch.load(reward_path))
            reward_func.to(device)
        else:
            from firl.models.discrim import ResNetAIRLDisc
            dis_kwargs = v['disc']
            dis_kwargs.update({
                'state_indices': [0, 1],
                'rew_clip_min': -10.0,
                'rew_clip_max': 10.0,
                'reward_scale': 1.0,
            })
            discriminator = ResNetAIRLDisc(
                len(env_kwargs['state_indices']), **dis_kwargs, device=device).to(device)
            discriminator.load_state_dict(torch.load(reward_path))
            print("discriminator loaded!")

            # The checkpoint filename encodes the training epoch after the last underscore.
            epoch = reward_path[reward_path.rfind('_') + 1:-4]
            print("reward epoch is: ", epoch)
            agent_path = osp.join(dir_path, 'model', f'agent_epoch_{epoch}.pkl')

            fake_env_func = lambda: gym.make(v['env']['env_name'], **v['env'])
            sac_agent = SAC(fake_env_func, None, k=1)
            sac_agent.ac.load_state_dict(torch.load(agent_path))
            print('sac agent loaded!')
            reward_func = Discriminator_reward(
                discriminator, mode='airl', device=device, agent=sac_agent, **dis_kwargs)

    save_name = 'no_prior' if reward_path is None else v['obj']
    if os.path.exists(f'./data/prior_reward/potential/{save_name}_{alpha}_{prior_reward_weight}_sac_test_rets.npy'):
        print("already obtained")
        return

    env = gym.make("GoalGrid-v0")
    reward_func = reward_func.get_scalar_reward if reward_func is not None else None
    env_fn = lambda: gym.make("GoalGrid-v0", r=reward_func, **env_kwargs)
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]

    sac_kwargs = {
        'epochs': 270,                   # to use the AIRL training schedule, change to 60k/30 = 2000
        'steps_per_epoch': env_kwargs['T'],
        'log_step_interval': env_kwargs['T'],  # with the AIRL training schedule, a larger log frequency (e.g. 300) also works
        'update_every': 1,               # update frequency; for the AIRL training schedule, change to 300
        'update_num': 1,                 # update steps per update; for the AIRL training schedule, change to 20
        'random_explore_episodes': 100,  # for the AIRL training schedule, change to 35 or 33 (roughly 1000 steps)
        'batch_size': 256,               # 64
        'lr': 0.003,                     # 3e-3
        'alpha': alpha,
        'automatic_alpha_tuning': False,
        'reinitialize': True,
        'buffer_size': 12000,
    }

    replay_buffer = ReplayBuffer(state_size, action_size, device=device, size=sac_kwargs['buffer_size'])
    sac_agent = SAC(env_fn, replay_buffer,
                    update_after=env_kwargs['T'] * sac_kwargs['random_explore_episodes'],
                    max_ep_len=env_kwargs['T'],
                    seed=seed,
                    start_steps=env_kwargs['T'] * sac_kwargs['random_explore_episodes'],
                    reward_state_indices=env_kwargs['state_indices'],
                    device=device,
                    k=1,
                    **sac_kwargs)

    # If the given checkpoint is an agent checkpoint, warm-start the SAC policy from it.
    if reward_path is not None and 'agent' in reward_path:
        sac_agent.ac.load_state_dict(torch.load(reward_path))
        print("sac agent loaded!")

    sac_test_rets, sac_alphas, sac_log_pis, sac_test_timestep = sac_agent.learn(
        n_parallel=1, print_out=True)

    if not osp.exists('data/prior_reward/potential'):
        os.makedirs('data/prior_reward/potential')
    np.save(f'./data/prior_reward/potential/{save_name}_{alpha}_{prior_reward_weight}_sac_test_rets.npy',
            np.asarray(sac_test_rets))
    np.save(f'./data/prior_reward/potential/{save_name}_{alpha}_{prior_reward_weight}_sac_time_steps.npy',
            np.asarray(sac_test_timestep))
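
# A minimal sketch (not part of the original script) of how the arrays saved by
# run() could be inspected afterwards. It only assumes numpy/matplotlib and the
# file-naming convention used above; the function name is hypothetical.
def plot_prior_reward_curves(save_name, alpha, prior_reward_weight):
    import matplotlib.pyplot as plt
    prefix = f'./data/prior_reward/potential/{save_name}_{alpha}_{prior_reward_weight}'
    rets = np.load(prefix + '_sac_test_rets.npy')
    steps = np.load(prefix + '_sac_time_steps.npy')
    plt.plot(steps, rets, label=f'{save_name} (alpha={alpha}, w={prior_reward_weight})')
    plt.xlabel('environment steps')
    plt.ylabel('test return')
    plt.legend()
    plt.show()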
if __name__ == "__main__":
    yaml = YAML()
    v = yaml.load(open(sys.argv[1]))

    # common parameters
    env_name, env_T = v['env']['env_name'], v['env']['T']
    state_indices = v['env']['state_indices']
    seed = v['seed']

    # system: device, threads, seed, pid
    device = torch.device(f"cuda:{v['cuda']}" if torch.cuda.is_available() and v['cuda'] >= 0 else "cpu")
    torch.set_num_threads(1)
    np.set_printoptions(precision=3, suppress=True)
    system.reproduce(seed)

    # environment
    env_fn = lambda: gym.make(env_name)
    gym_env = env_fn()
    state_size = gym_env.observation_space.shape[0]
    action_size = gym_env.action_space.shape[0]
    if state_indices == 'all':
        state_indices = list(range(state_size))

    # load reward
    assert 'reward' in v
    reward_func = MLPReward(len(state_indices), **v['reward'], device=device).to(device)
    reward_func.load_state_dict(torch.load(v['reward']['path']))
    reward_name = "-".join(v['reward']['path'].split('/'))
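
    # A hedged sketch of how the loaded reward could be probed on a batch of 2D
    # states. It relies only on get_scalar_reward(), which run() above also uses,
    # and assumes for illustration that the reward is defined over the first two
    # state dimensions on a unit square; adjust the ranges to the actual env.
    xs, ys = np.meshgrid(np.linspace(0.0, 1.0, 50), np.linspace(0.0, 1.0, 50))
    grid_states = np.stack([xs.ravel(), ys.ravel()], axis=1)
    grid_rewards = reward_func.get_scalar_reward(grid_states)
    print(f'reward over grid for {reward_name}: '
          f'min {grid_rewards.min():.3f}, max {grid_rewards.max():.3f}')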
def get_policy():
    yaml = YAML()
    # root_dir = '/home/ankur/MSR_Research_Home/Actor-Residual-Critic/logs/PlanarReachGoal1DenseFH-v0/exp-64/arc-f-max-rkl/2021_08_18_01_57_16'
    # config_file = 'variant_21139.yml'
    # ckpt_file = 'env_steps_9000.pt'
    root_dir = '/home/ankur/MSR_Research_Home/Actor-Residual-Critic/logs_ava/PlanarPushGoal1DenseFH-v0/exp-64/f-max-rkl/2021_08_19_01_52_10'
    config_file = 'variant_51349.yml'
    ckpt_file = 'env_steps_500000.pt'

    v = yaml.load(open(os.path.join(root_dir, config_file)))

    # common parameters
    env_name = v['env']['env_name']
    env_T = v['env']['T']
    state_indices = v['env']['state_indices']
    seed = v['seed']
    num_expert_trajs = v['irl']['expert_episodes']

    # system: device, threads, seed, pid
    device = torch.device(f"cuda:{v['cuda']}" if torch.cuda.is_available() and v['cuda'] >= 0 else "cpu")
    print('Device is', device)
    torch.set_num_threads(1)
    np.set_printoptions(precision=3, suppress=True)
    system.reproduce(seed)
    pid = os.getpid()

    # assumptions
    assert v['obj'] in [
        'f-max-rkl', 'arc-f-max-rkl', 'gail', 'arc-gail', 'fairl', 'arc-fairl',
        'airl', 'arc-airl', 'naive-diff-gail', 'naive-diff-f-max-rkl'
    ]  # approximate [RKL, JSD, FKL, RKL]

    # environment
    env_fn = lambda: gym.make(env_name)
    gym_env = env_fn()
    state_size = gym_env.observation_space.shape[0]
    action_size = gym_env.action_space.shape[0]
    if state_indices == 'all':
        state_indices = list(range(state_size))

    if v['adv_irl']['normalize']:
        # expert_trajs is assumed to be loaded at module level before this point.
        expert_samples_ = expert_trajs.copy().reshape(-1, len(state_indices))
        obs_mean, obs_std = expert_samples_.mean(0), expert_samples_.std(0)
        obs_std[obs_std == 0.0] = 1.0  # avoid constant distribution
        expert_samples = (expert_samples_ - obs_mean) / obs_std  # normalize expert data
        print('obs_mean, obs_std', obs_mean, obs_std)
        env_fn = lambda: gym.make(env_name, obs_mean=obs_mean, obs_std=obs_std)

    if v['obj'] in [
            'arc-f-max-rkl', 'arc-gail', 'arc-airl', 'arc-fairl',
            'naive-diff-gail', 'naive-diff-f-max-rkl'
    ]:
        agent = SARC(env_fn, None,
                     steps_per_epoch=v['env']['T'],
                     max_ep_len=v['env']['T'],
                     seed=seed,
                     reward_state_indices=state_indices,
                     device=device,
                     objective=v['obj'],
                     reward_scale=v['adv_irl']['reward_scale'],
                     **v['sac'])
    else:
        agent = SAC(env_fn, None,
                    steps_per_epoch=v['env']['T'],
                    max_ep_len=v['env']['T'],
                    seed=seed,
                    reward_state_indices=state_indices,
                    device=device,
                    **v['sac'])

    agent.test_fn = agent.test_agent_ori_env
    agent.ac.load_state_dict(torch.load(os.path.join(root_dir, 'agent', ckpt_file)))
    policy = agent.get_action
    return policy
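
# A minimal, hedged usage sketch for get_policy(): roll the returned policy out
# for one episode. The env name matches the checkpointed run's directory above;
# the horizon T and the function name are illustrative assumptions, and the
# policy is assumed to accept a single observation (agent.get_action).
def rollout_once(env_name='PlanarPushGoal1DenseFH-v0', T=200):
    policy = get_policy()
    env = gym.make(env_name)
    obs, ep_ret = env.reset(), 0.0
    for _ in range(T):
        action = policy(obs)                      # agent.get_action(obs)
        obs, reward, done, _ = env.step(action)   # old gym step API (4-tuple)
        ep_ret += reward
        if done:
            break
    print('episode return:', ep_ret)
    return ep_ret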