Example #1
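# Imports assumed by this excerpt (only standard-library / third-party modules are
# shown; SAC, ReplayBuffer, Discriminator_reward, use_reward and `system` are
# project modules imported elsewhere in the original file):
import json
import os
import os.path as osp

import gym
import numpy as np
import torch
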
def run(reward_path, alpha=0.01, prior_reward_weight=0.1):
    seed = 0
    system.reproduce(seed)
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    env_kwargs = {
        'T': 30,
        'state_indices': [0, 1],
        'size_x': 6,
        'size_y': 6,
        'prior_reward_weight': prior_reward_weight,
    }

    reward_func = None
    if reward_path is not None:
        dir_path = osp.dirname(osp.dirname(reward_path))
        print("dir path is: ", dir_path)
        v_path = osp.join(dir_path, 'variant.json')
        with open(v_path, 'r') as f:
            v = json.load(f)
        print(v)

        if use_reward(v['obj']):
            from firl.models.reward import MLPReward
            reward_kwargs = v['reward']
            reward_func = MLPReward(len(env_kwargs['state_indices']),
                                    **reward_kwargs,
                                    device=device)
            reward_func.load_state_dict(torch.load(reward_path))
            reward_func.to(device)
        else:
            from firl.models.discrim import ResNetAIRLDisc
            dis_kwargs = v['disc']
            dis_kwargs.update({
                'state_indices': [0, 1],
                'rew_clip_min': -10.0,
                'rew_clip_max': 10.0,
                'reward_scale': 1.0,
            })
            discriminator = ResNetAIRLDisc(len(env_kwargs['state_indices']),
                                           **dis_kwargs,
                                           device=device).to(device)
            discriminator.load_state_dict(torch.load(reward_path))
            print("discriminator loaded!")

            # epoch index parsed from the checkpoint filename, e.g. '..._epoch_100.pkl' -> '100'
            epoch = reward_path[reward_path.rfind('_') + 1:-4]
            print("reward epoch is: ", epoch)
            agent_path = osp.join(dir_path, 'model',
                                  f'agent_epoch_{epoch}.pkl')
            fake_env_func = lambda: gym.make(v['env']['env_name'], **v['env'])
            sac_agent = SAC(fake_env_func, None, k=1)
            sac_agent.ac.load_state_dict(torch.load(agent_path))
            print('sac agent loaded!')

            reward_func = Discriminator_reward(discriminator,
                                               mode='airl',
                                               device=device,
                                               agent=sac_agent,
                                               **dis_kwargs)

    save_name = 'no_prior' if reward_path is None else v['obj']
    save_prefix = f'./data/prior_reward/potential/{save_name}_{alpha}_{prior_reward_weight}'
    if os.path.exists(f'{save_prefix}_sac_test_rets.npy'):
        print("already obtained")
        return

    env = gym.make("GoalGrid-v0")
    reward_func = reward_func.get_scalar_reward if reward_func is not None else None
    env_fn = lambda: gym.make("GoalGrid-v0", r=reward_func, **env_kwargs)
    state_size = env.observation_space.shape[0]
    action_size = env.action_space.shape[0]

    sac_kwargs = {
        'epochs': 270,  # for the AIRL training schedule, change to 60k/30 = 2000
        'steps_per_epoch': env_kwargs['T'],
        'log_step_interval': env_kwargs['T'],  # for the AIRL schedule the log frequency can be larger, e.g. 300
        'update_every': 1,  # update frequency; for the AIRL schedule, change to 300
        'update_num': 1,  # update steps per update; for the AIRL schedule, change to 20
        'random_explore_episodes': 100,  # for the AIRL schedule, change to 35 or 33 (roughly 1000 steps)
        'batch_size': 256,  # 64
        'lr': 0.003,  # 3e-3
        'alpha': alpha,
        'automatic_alpha_tuning': False,
        'reinitialize': True,
        'buffer_size': 12000,
    }

    replay_buffer = ReplayBuffer(state_size,
                                 action_size,
                                 device=device,
                                 size=sac_kwargs['buffer_size'])

    sac_agent = SAC(
        env_fn,
        replay_buffer,
        update_after=env_kwargs['T'] * sac_kwargs['random_explore_episodes'],
        max_ep_len=env_kwargs['T'],
        seed=seed,
        start_steps=env_kwargs['T'] * sac_kwargs['random_explore_episodes'],
        reward_state_indices=env_kwargs['state_indices'],
        device=device,
        k=1,
        **sac_kwargs)

    if reward_path is not None and 'agent' in reward_path:
        sac_agent.ac.load_state_dict(torch.load(reward_path))
        print("sac agent loaded!")

    sac_test_rets, sac_alphas, sac_log_pis, sac_test_timestep = sac_agent.learn(
        n_parallel=1, print_out=True)
    os.makedirs('./data/prior_reward/potential', exist_ok=True)
    np.save(f'{save_prefix}_sac_test_rets.npy', np.asarray(sac_test_rets))
    np.save(f'{save_prefix}_sac_time_steps.npy', np.asarray(sac_test_timestep))
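
A minimal usage sketch for run() above. The checkpoint path is a placeholder; a non-None reward_path must have its variant.json two levels above the checkpoint file, as the loading code expects:

if __name__ == "__main__":
    # Plain SAC baseline, no learned prior reward.
    run(reward_path=None, alpha=0.01, prior_reward_weight=0.1)
    # With a saved reward/discriminator checkpoint (hypothetical path):
    # run('logs/exp/model/reward_model_epoch_100.pkl', alpha=0.01, prior_reward_weight=0.1)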
Example #2
if __name__ == "__main__":
    yaml = YAML()  # YAML() is ruamel.yaml's loader here (import not shown in this excerpt)
    with open(sys.argv[1]) as f:
        v = yaml.load(f)

    # common parameters
    env_name, env_T = v['env']['env_name'], v['env']['T']
    state_indices = v['env']['state_indices']
    seed = v['seed']

    # system: device, threads, seed, pid
    device = torch.device(f"cuda:{v['cuda']}" if torch.cuda.is_available()
                          and v['cuda'] >= 0 else "cpu")
    torch.set_num_threads(1)
    np.set_printoptions(precision=3, suppress=True)
    system.reproduce(seed)

    # environment
    env_fn = lambda: gym.make(env_name)
    gym_env = env_fn()
    state_size = gym_env.observation_space.shape[0]
    action_size = gym_env.action_space.shape[0]
    if state_indices == 'all':
        state_indices = list(range(state_size))

    # load reward
    assert 'reward' in v
    reward_func = MLPReward(len(state_indices), **v['reward'],
                            device=device).to(device)
    reward_func.load_state_dict(torch.load(v['reward']['path']))
    reward_name = "-".join(v['reward']['path'].split('/'))
Example #3
def get_policy():
    yaml = YAML()
    # root_dir = '/home/ankur/MSR_Research_Home/Actor-Residual-Critic/logs/PlanarReachGoal1DenseFH-v0/exp-64/arc-f-max-rkl/2021_08_18_01_57_16'
    # config_file = 'variant_21139.yml'
    # ckpt_file = 'env_steps_9000.pt'
    root_dir = '/home/ankur/MSR_Research_Home/Actor-Residual-Critic/logs_ava/PlanarPushGoal1DenseFH-v0/exp-64/f-max-rkl/2021_08_19_01_52_10'
    config_file = 'variant_51349.yml'
    ckpt_file = 'env_steps_500000.pt'
    with open(os.path.join(root_dir, config_file)) as f:
        v = yaml.load(f)

    # common parameters
    env_name = v['env']['env_name']
    env_T = v['env']['T']
    state_indices = v['env']['state_indices']
    seed = v['seed']
    num_expert_trajs = v['irl']['expert_episodes']

    # system: device, threads, seed, pid
    device = torch.device(f"cuda:{v['cuda']}" if torch.cuda.is_available()
                          and v['cuda'] >= 0 else "cpu")
    print('Device is', device)
    torch.set_num_threads(1)
    np.set_printoptions(precision=3, suppress=True)
    system.reproduce(seed)
    pid = os.getpid()

    # assumptions
    assert v['obj'] in [
        'f-max-rkl', 'arc-f-max-rkl', 'gail', 'arc-gail', 'fairl', 'arc-fairl',
        'airl', 'arc-airl', 'naive-diff-gail', 'naive-diff-f-max-rkl'
    ]  # approximate [RKL, JSD, FKL, RKL]

    # environment
    env_fn = lambda: gym.make(env_name)
    gym_env = env_fn()
    state_size = gym_env.observation_space.shape[0]
    action_size = gym_env.action_space.shape[0]
    if state_indices == 'all':
        state_indices = list(range(state_size))

    if v['adv_irl']['normalize']:
        # expert_trajs is loaded earlier in the original script (not shown in this excerpt)
        expert_samples = expert_trajs.copy().reshape(-1, len(state_indices))
        obs_mean, obs_std = expert_samples.mean(0), expert_samples.std(0)
        obs_std[obs_std == 0.0] = 1.0  # avoid division by zero on constant dimensions
        expert_samples = (expert_samples - obs_mean) / obs_std  # normalize expert data
        print('obs_mean, obs_std', obs_mean, obs_std)
        env_fn = lambda: gym.make(env_name, obs_mean=obs_mean, obs_std=obs_std)

    if v['obj'] in [
            'arc-f-max-rkl', 'arc-gail', 'arc-airl', 'arc-fairl',
            'naive-diff-gail', 'naive-diff-f-max-rkl'
    ]:
        agent = SARC(env_fn,
                     None,
                     steps_per_epoch=v['env']['T'],
                     max_ep_len=v['env']['T'],
                     seed=seed,
                     reward_state_indices=state_indices,
                     device=device,
                     objective=v['obj'],
                     reward_scale=v['adv_irl']['reward_scale'],
                     **v['sac'])
    else:
        agent = SAC(env_fn,
                    None,
                    steps_per_epoch=v['env']['T'],
                    max_ep_len=v['env']['T'],
                    seed=seed,
                    reward_state_indices=state_indices,
                    device=device,
                    **v['sac'])

    agent.test_fn = agent.test_agent_ori_env
    agent.ac.load_state_dict(
        torch.load(os.path.join(root_dir, 'agent', ckpt_file)))
    policy = agent.get_action
    return policy
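
A hedged rollout sketch for the returned policy, assuming the pre-0.26 gym step API and that agent.get_action accepts a single observation (some SAC implementations also take a deterministic flag):

policy = get_policy()
env = gym.make('PlanarPushGoal1DenseFH-v0')  # matches the env_name of the checkpoint loaded above
obs, ep_ret, done = env.reset(), 0.0, False
while not done:
    obs, reward, done, _ = env.step(policy(obs))
    ep_ret += reward
print('episode return:', ep_ret)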