Example #1
def experiment(variant):

    domain = variant['domain']
    seed = variant['seed']
    exp_mode = variant['exp_mode']
    max_path_length = variant['algo_params']['max_path_length']
    bcq_interactions = variant['bcq_interactions']
    num_tasks = variant['num_tasks']

    filename = f'./goals/{domain}-{exp_mode}-goals.pkl'
    with open(filename, 'rb') as f:
        idx_list, train_goals, wd_goals, ood_goals = pickle.load(f)
    idx_list = idx_list[:num_tasks]

    sub_buffer_dir = f"buffers/{domain}/{exp_mode}/max_path_length_{max_path_length}/interactions_{bcq_interactions}k/seed_{seed}"
    buffer_dir = os.path.join(variant['data_models_root'], sub_buffer_dir)

    print("Buffer directory: " + buffer_dir)

    # Load buffer
    bcq_buffers = []

    buffer_loader_id_list = []
    for i, idx in enumerate(idx_list):
        bname = f'goal_{idx:02d}.zip_pkl'
        filename = os.path.join(buffer_dir, bname)
        rp_buffer = ReplayBuffer.remote(
            index=i,
            seed=seed,
            num_trans_context=variant['num_trans_context'],
            in_mdp_batch_size=variant['in_mdp_batch_size'],
        )

        buffer_loader_id_list.append(rp_buffer.load_from_gzip.remote(filename))
        bcq_buffers.append(rp_buffer)
    ray.get(buffer_loader_id_list)

    assert len(bcq_buffers) == len(idx_list)

    train_buffer = MultiTaskReplayBuffer(bcq_buffers_list=bcq_buffers)

    set_seed(variant['seed'])

    # create multi-task environment and sample tasks
    env = env_producer(variant['domain'], seed=0)
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    reward_dim = 1

    # instantiate networks
    latent_dim = variant['latent_size']
    context_encoder_input_dim = (
        2 * obs_dim + action_dim + reward_dim
        if variant['algo_params']['use_next_obs_in_context']
        else obs_dim + action_dim + reward_dim)
    context_encoder_output_dim = (
        latent_dim * 2
        if variant['algo_params']['use_information_bottleneck']
        else latent_dim)
    net_size = variant['net_size']
    recurrent = variant['algo_params']['recurrent']
    encoder_model = RecurrentEncoder if recurrent else MlpEncoder

    context_encoder = encoder_model(
        hidden_sizes=[200, 200, 200],
        input_size=context_encoder_input_dim,
        output_size=context_encoder_output_dim,
    )
    qf1 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + latent_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size, net_size],
        obs_dim=obs_dim + latent_dim,
        latent_dim=latent_dim,
        action_dim=action_dim,
    )
    agent = PEARLAgent(latent_dim, context_encoder, policy,
                       **variant['algo_params'])
    algorithm = PEARLSoftActorCritic(env=env,
                                     train_goals=train_goals,
                                     wd_goals=wd_goals,
                                     ood_goals=ood_goals,
                                     replay_buffers=train_buffer,
                                     nets=[agent, qf1, qf2, vf],
                                     latent_dim=latent_dim,
                                     **variant['algo_params'])

    # optionally load pre-trained weights
    if variant['path_to_weights'] is not None:
        path = variant['path_to_weights']
        context_encoder.load_state_dict(
            torch.load(os.path.join(path, 'context_encoder.pth')))
        qf1.load_state_dict(torch.load(os.path.join(path, 'qf1.pth')))
        qf2.load_state_dict(torch.load(os.path.join(path, 'qf2.pth')))
        vf.load_state_dict(torch.load(os.path.join(path, 'vf.pth')))
        # TODO hacky, revisit after model refactor
        algorithm.networks[-2].load_state_dict(
            torch.load(os.path.join(path, 'target_vf.pth')))
        policy.load_state_dict(torch.load(os.path.join(path, 'policy.pth')))

    # optional GPU mode
    ptu.set_gpu_mode(variant['util_params']['use_gpu'],
                     variant['util_params']['gpu_id'])
    if ptu.gpu_enabled():
        algorithm.to()

    # debugging triggers a lot of printing and logs to a debug directory
    DEBUG = variant['util_params']['debug']
    os.environ['DEBUG'] = str(int(DEBUG))

    # create logging directory
    # TODO support Docker
    exp_id = 'debug' if DEBUG else None
    experiment_log_dir = setup_logger(
        variant['domain'],
        variant=variant,
        exp_id=exp_id,
        base_log_dir=variant['util_params']['base_log_dir'])

    # optionally save eval trajectories as pkl files
    if variant['algo_params']['dump_eval_paths']:
        pickle_dir = experiment_log_dir + '/eval_trajectories'
        pathlib.Path(pickle_dir).mkdir(parents=True, exist_ok=True)

    # run the algorithm
    algorithm.train()
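
A minimal driver sketch for Example #1, assuming this snippet lives in a script that already imports `experiment` and its dependencies. Every key shown is one the function actually reads, but the values are placeholders, and `algo_params` is deliberately partial (PEARLAgent and PEARLSoftActorCritic consume more keys via `**variant['algo_params']`).

# Hypothetical usage sketch; values are placeholders that must match the
# goals/ and buffers/ artifacts expected on disk.
import ray

variant = dict(
    domain='ant-dir',                     # placeholder domain name
    seed=0,
    exp_mode='normal',                    # placeholder exp_mode
    bcq_interactions=10,                  # appears as interactions_{...}k in the buffer path
    num_tasks=10,
    num_trans_context=64,
    in_mdp_batch_size=128,
    data_models_root='./data',
    latent_size=5,
    net_size=300,
    path_to_weights=None,
    algo_params=dict(                     # partial: more keys are consumed via **algo_params
        max_path_length=200,
        use_next_obs_in_context=False,
        use_information_bottleneck=True,
        recurrent=False,
        dump_eval_paths=False,
    ),
    util_params=dict(use_gpu=False, gpu_id=0, debug=False, base_log_dir='./output'),
)

ray.init()        # the ReplayBuffer actors are created with .remote(), so Ray must be up
experiment(variant)
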
Example #2
def experiment(variant):

    # create multi-task environment and sample tasks
    env = NormalizedBoxEnv(ENVS[variant['env_name']](**variant['env_params']))
    tasks = env.get_all_task_idx()
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    reward_dim = 1

    # instantiate networks
    latent_dim = variant['latent_size']
    context_encoder_input_dim = (
        2 * obs_dim + action_dim + reward_dim
        if variant['algo_params']['use_next_obs_in_context']
        else obs_dim + action_dim + reward_dim)
    context_encoder_output_dim = (
        latent_dim * 2
        if variant['algo_params']['use_information_bottleneck']
        else latent_dim)
    net_size = variant['net_size']
    recurrent = variant['algo_params']['recurrent']
    encoder_model = RecurrentEncoder if recurrent else MlpEncoder

    context_encoder = encoder_model(
        hidden_sizes=[200, 200, 200],
        input_size=context_encoder_input_dim,
        output_size=context_encoder_output_dim,
    )
    qf1 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + latent_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size, net_size],
        obs_dim=obs_dim + latent_dim,
        latent_dim=latent_dim,
        action_dim=action_dim,
    )
    agent = PEARLAgent(latent_dim, context_encoder, policy,
                       **variant['algo_params'])
    algorithm = PEARLSoftActorCritic(
        env=env,
        train_tasks=list(tasks[:variant['n_train_tasks']]),
        eval_tasks=list(tasks[-variant['n_eval_tasks']:]),
        nets=[agent, qf1, qf2, vf],
        latent_dim=latent_dim,
        **variant['algo_params'])

    # optionally load pre-trained weights
    if variant['path_to_weights'] is not None:
        path = variant['path_to_weights']
        context_encoder.load_state_dict(
            torch.load(os.path.join(path, 'context_encoder.pth')))
        qf1.load_state_dict(torch.load(os.path.join(path, 'qf1.pth')))
        qf2.load_state_dict(torch.load(os.path.join(path, 'qf2.pth')))
        vf.load_state_dict(torch.load(os.path.join(path, 'vf.pth')))
        # TODO hacky, revisit after model refactor
        algorithm.networks[-2].load_state_dict(
            torch.load(os.path.join(path, 'target_vf.pth')))
        policy.load_state_dict(torch.load(os.path.join(path, 'policy.pth')))

    # optional GPU mode
    ptu.set_gpu_mode(variant['util_params']['use_gpu'],
                     variant['util_params']['gpu_id'])
    if ptu.gpu_enabled():
        algorithm.to()

    # debugging triggers a lot of printing and logs to a debug directory
    DEBUG = variant['util_params']['debug']
    os.environ['DEBUG'] = str(int(DEBUG))

    # create logging directory
    # TODO support Docker
    exp_id = 'debug' if DEBUG else None
    experiment_log_dir = setup_logger(
        variant['env_name'],
        variant=variant,
        exp_id=exp_id,
        base_log_dir=variant['util_params']['base_log_dir'])

    # optionally save eval trajectories as pkl files
    if variant['algo_params']['dump_eval_paths']:
        pickle_dir = experiment_log_dir + '/eval_trajectories'
        pathlib.Path(pickle_dir).mkdir(parents=True, exist_ok=True)

    # run the algorithm
    algorithm.train()
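
The encoder sizes above are pure arithmetic on the two algo flags; here is a small standalone sketch of that arithmetic (the helper name is ours, not from the repository).

# Sketch of the dimension arithmetic used above.
def context_encoder_dims(obs_dim, action_dim, latent_dim,
                         use_next_obs_in_context, use_information_bottleneck,
                         reward_dim=1):
    """Return (input_size, output_size) for the context encoder."""
    input_size = obs_dim + action_dim + reward_dim
    if use_next_obs_in_context:
        input_size += obs_dim   # context transitions also carry next_obs
    # with an information bottleneck the encoder outputs two parameters per latent dim
    output_size = latent_dim * 2 if use_information_bottleneck else latent_dim
    return input_size, output_size

# e.g. obs_dim=20, action_dim=6, latent_dim=5
print(context_encoder_dims(20, 6, 5, False, True))   # (27, 10)
print(context_encoder_dims(20, 6, 5, True, False))   # (47, 5)
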
Example #3
def sim_policy(variant,
               path_to_exp,
               num_trajs=1,
               deterministic=False,
               save_video=False,
               animated=False):
    '''
    simulate a trained policy adapting to a new task
    optionally save videos of the trajectories - requires ffmpeg

    :variant: experiment configuration dict
    :path_to_exp: path to the experiment folder
    :num_trajs: number of trajectories to simulate per task (default 1)
    :deterministic: if the policy is deterministic (default stochastic)
    :save_video: whether to generate and save a video (default False)
    :animated: whether to render the rollouts on screen (default False)
    '''

    # create multi-task environment and sample tasks
    env = CameraWrapper(
        NormalizedBoxEnv(ENVS[variant['env_name']](**variant['env_params'])),
        variant['util_params']['gpu_id'])
    if animated:
        env.render()
    tasks = env.get_all_task_idx()
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    eval_tasks = list(tasks[-variant['n_eval_tasks']:])
    print('testing on {} test tasks, {} trajectories each'.format(
        len(eval_tasks), num_trajs))

    # instantiate networks
    latent_dim = variant['latent_size']
    context_encoder_output_dim = (
        latent_dim * 2
        if variant['algo_params']['use_information_bottleneck']
        else latent_dim)
    reward_dim = 1
    net_size = variant['net_size']
    recurrent = variant['algo_params']['recurrent']
    encoder_model = RecurrentEncoder if recurrent else MlpEncoder

    context_encoder = encoder_model(
        hidden_sizes=[200, 200, 200],
        input_size=obs_dim + action_dim + reward_dim,
        output_size=context_encoder_output_dim,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size, net_size],
        obs_dim=obs_dim + latent_dim,
        latent_dim=latent_dim,
        action_dim=action_dim,
    )
    agent = PEARLAgent(latent_dim, context_encoder, policy,
                       **variant['algo_params'])
    # deterministic eval
    if deterministic:
        agent = MakeDeterministic(agent)

    # load trained weights (otherwise simulate random policy)
    context_encoder.load_state_dict(
        torch.load(os.path.join(path_to_exp, 'context_encoder.pth'),
                   map_location=torch.device('cpu')))
    policy.load_state_dict(
        torch.load(os.path.join(path_to_exp, 'policy.pth'),
                   map_location=torch.device('cpu')))

    # loop through tasks collecting rollouts
    all_rets = []
    video_frames = []
    for idx in eval_tasks:
        env.reset_task(idx)
        agent.clear_z()
        paths = []
        for n in range(num_trajs):
            path = rollout(
                env,
                agent,
                max_path_length=variant['algo_params']['num_steps_per_eval'],
                accum_context=True,
                animated=animated,
                save_frames=save_video)
            paths.append(path)
            if save_video:
                video_frames += [t['frame'] for t in path['env_infos']]
            if n >= variant['algo_params']['num_exp_traj_eval']:
                agent.infer_posterior(agent.context)
        all_rets.append([sum(p['rewards']) for p in paths])

    if save_video:
        # save frames to file temporarily
        temp_dir = os.path.join(path_to_exp, 'temp')
        os.makedirs(temp_dir, exist_ok=True)
        for i, frm in enumerate(video_frames):
            frm.save(os.path.join(temp_dir, '%06d.jpg' % i))

        video_filename = os.path.join(path_to_exp, 'video.mp4')
        # run ffmpeg to make the video
        os.system('ffmpeg -i {}/%06d.jpg -vcodec mpeg4 {}'.format(
            temp_dir, video_filename))
        # delete the frames
        shutil.rmtree(temp_dir)

    # compute average returns across tasks
    n = min([len(a) for a in all_rets])
    rets = [a[:n] for a in all_rets]
    rets = np.mean(np.stack(rets), axis=0)
    for i, ret in enumerate(rets):
        print('trajectory {}, avg return: {} \n'.format(i, ret))
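
A hypothetical way to call sim_policy: only the signature comes from the snippet above, while the variant.json filename, the experiment path, and the argument values are assumptions.

# Hypothetical driver for sim_policy(); the variant.json name and the paths are assumptions.
import json
import os

path_to_exp = './output/cheetah-vel/pearl_0'          # placeholder experiment folder
with open(os.path.join(path_to_exp, 'variant.json')) as f:
    variant = json.load(f)

sim_policy(variant,
           path_to_exp,
           num_trajs=3,
           deterministic=True,
           save_video=False,    # set True only if ffmpeg is on PATH
           animated=False)
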
Example #4
def experiment(variant):
    print(variant['env_name'])
    print(variant['env_params'])
    env = NormalizedBoxEnv(ENVS[variant['env_name']](**variant['env_params']))
    tasks = env.get_all_task_idx()

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    cont_latent_dim, num_cat, latent_dim, num_dir, dir_latent_dim = read_dim(variant['global_latent'])
    r_cont_dim, r_n_cat, r_cat_dim, r_n_dir, r_dir_dim = read_dim(variant['vrnn_latent'])
    reward_dim = 1
    net_size = variant['net_size']
    recurrent = variant['algo_params']['recurrent']
    glob = variant['algo_params']['glob']
    rnn = variant['rnn']
    vrnn_latent = variant['vrnn_latent']
    encoder_model = MlpEncoder
    if recurrent:
        if variant['vrnn_constraint'] == 'logitnormal':
            output_size = r_cont_dim * 2 + r_n_cat * r_cat_dim + r_n_dir * r_dir_dim * 2
        else:
            output_size = r_cont_dim * 2 + r_n_cat * r_cat_dim + r_n_dir * r_dir_dim
        if variant['rnn_sample'] == 'batch_sampling':
            if variant['algo_params']['use_next_obs']:
                input_size = (2 * obs_dim + action_dim + reward_dim) * variant['temp_res']
            else:
                input_size = (obs_dim + action_dim + reward_dim) * variant['temp_res']
        else:
            if variant['algo_params']['use_next_obs']:
                input_size = (2 * obs_dim + action_dim + reward_dim)
            else:
                input_size = (obs_dim + action_dim + reward_dim)
        if rnn == 'rnn':
            recurrent_model = RecurrentEncoder
            recurrent_context_encoder = recurrent_model(
                hidden_sizes=[net_size, net_size, net_size],
                input_size=input_size,
                output_size=output_size,
            )
        elif rnn == 'vrnn':
            recurrent_model = VRNNEncoder
            recurrent_context_encoder = recurrent_model(
                hidden_sizes=[net_size, net_size, net_size],
                input_size=input_size,
                output_size=output_size, 
                temperature=variant['temperature'],
                vrnn_latent=variant['vrnn_latent'],
                vrnn_constraint=variant['vrnn_constraint'],
                r_alpha=variant['vrnn_alpha'],
                r_var=variant['vrnn_var'],
            )

    else:
        recurrent_context_encoder = None

    ptu.set_gpu_mode(variant['util_params']['use_gpu'], variant['util_params']['gpu_id'])
    if glob:
        if dir_latent_dim > 0 and variant['constraint'] == 'logitnormal':
            output_size = cont_latent_dim * 2 + num_cat * latent_dim + num_dir * dir_latent_dim * 2
        else:
            output_size = cont_latent_dim * 2 + num_cat * latent_dim + num_dir * dir_latent_dim
        if variant['algo_params']['use_next_obs']:
            input_size = 2 * obs_dim + action_dim + reward_dim
        else:
            input_size = obs_dim + action_dim + reward_dim
        global_context_encoder = encoder_model(
            hidden_sizes=[net_size, net_size, net_size],
            input_size=input_size,
            output_size=output_size, 
        )
    else:
        global_context_encoder = None      
    # combined task-latent dimension from the global and recurrent (VRNN) encoders
    z_dim = (latent_dim * num_cat + cont_latent_dim + dir_latent_dim * num_dir
             + r_n_cat * r_cat_dim + r_cont_dim + r_n_dir * r_dir_dim)
    qf1 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + z_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + z_dim,
        output_size=1,
    )
    target_qf1 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + z_dim,
        output_size=1,
    )
    target_qf2 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + z_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size, net_size],
        obs_dim=obs_dim + z_dim,
        latent_dim=z_dim,
        action_dim=action_dim,
    )
    agent = PEARLAgent(
        global_context_encoder,
        recurrent_context_encoder,
        variant['global_latent'],
        variant['vrnn_latent'],
        policy,
        variant['temperature'],
        variant['unitkl'],
        variant['alpha'],
        variant['constraint'],
        variant['vrnn_constraint'],
        variant['var'],
        variant['vrnn_alpha'],
        variant['vrnn_var'],
        rnn,
        variant['temp_res'],
        variant['rnn_sample'],
        variant['weighted_sample'],
        **variant['algo_params']
    )
    if variant['path_to_weights'] is not None:
        path = variant['path_to_weights']
        with open(os.path.join(path, 'extra_data.pkl'), 'rb') as f:
            extra_data = pickle.load(f)
            variant['algo_params']['start_epoch'] = extra_data['epoch'] + 1
            replay_buffer = extra_data['replay_buffer']
            enc_replay_buffer = extra_data['enc_replay_buffer']
            variant['algo_params']['_n_train_steps_total'] = extra_data['_n_train_steps_total']
            variant['algo_params']['_n_env_steps_total'] = extra_data['_n_env_steps_total']
            variant['algo_params']['_n_rollouts_total'] = extra_data['_n_rollouts_total']
    else:
        replay_buffer = None
        enc_replay_buffer = None

    algorithm = PEARLSoftActorCritic(
        env=env,
        train_tasks=list(tasks[:variant['n_train_tasks']]),
        eval_tasks=list(tasks[-variant['n_eval_tasks']:]),
        nets=[agent, qf1, qf2, target_qf1, target_qf2],
        latent_dim=latent_dim,
        replay_buffer=replay_buffer,
        enc_replay_buffer=enc_replay_buffer,
        temp_res=variant['temp_res'],
        rnn_sample=variant['rnn_sample'],
        **variant['algo_params']
    )

    if variant['path_to_weights'] is not None: 
        path = variant['path_to_weights']
        if recurrent_context_encoder is not None:
            recurrent_context_encoder.load_state_dict(torch.load(os.path.join(path, 'recurrent_context_encoder.pth')))
        if global_context_encoder is not None:
            global_context_encoder.load_state_dict(torch.load(os.path.join(path, 'global_context_encoder.pth')))
        qf1.load_state_dict(torch.load(os.path.join(path, 'qf1.pth')))
        qf2.load_state_dict(torch.load(os.path.join(path, 'qf2.pth')))
        target_qf1.load_state_dict(torch.load(os.path.join(path, 'target_qf1.pth')))
        target_qf2.load_state_dict(torch.load(os.path.join(path, 'target_qf2.pth')))
        policy.load_state_dict(torch.load(os.path.join(path, 'policy.pth')))

    if ptu.gpu_enabled():
        algorithm.to()

    DEBUG = variant['util_params']['debug']
    os.environ['DEBUG'] = str(int(DEBUG))
    exp_id = 'debug' if DEBUG else None
    if variant.get('log_name', "") == "":
        log_name = variant['env_name']
    else:
        log_name = variant['log_name']
    experiment_log_dir = setup_logger(log_name,
                                      variant=variant,
                                      exp_id=exp_id,
                                      base_log_dir=variant['util_params']['base_log_dir'],
                                      config_log_dir=variant['util_params']['config_log_dir'],
                                      log_dir=variant['util_params']['log_dir'])
    if variant['algo_params']['dump_eval_paths']:
        pickle_dir = experiment_log_dir + '/eval_trajectories'
        pathlib.Path(pickle_dir).mkdir(parents=True, exist_ok=True)

    env.save_all_tasks(experiment_log_dir)

    if variant['eval']:
        algorithm._try_to_eval(0, eval_all=True, eval_train_offline=False, animated=True)
    else:
        algorithm.train()
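
A worked sketch of the recurrent encoder's output_size branch above, using placeholder dimensions in place of read_dim(variant['vrnn_latent']), whose string format is not shown here.

# Worked example of the recurrent encoder output_size computed above; the five
# dimensions would normally come from read_dim(variant['vrnn_latent']), so
# placeholder integers are used directly.
r_cont_dim, r_n_cat, r_cat_dim, r_n_dir, r_dir_dim = 2, 1, 4, 1, 3

# 'logitnormal' constraint: two parameters per directional latent dim
out_logitnormal = r_cont_dim * 2 + r_n_cat * r_cat_dim + r_n_dir * r_dir_dim * 2
# any other constraint: a single parameter per directional latent dim
out_other = r_cont_dim * 2 + r_n_cat * r_cat_dim + r_n_dir * r_dir_dim

print(out_logitnormal, out_other)   # 14 11
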
Example #5
def setup_and_run(variant):

    ptu.set_gpu_mode(variant['util_params']['use_gpu'],
                     variant['seed'] % variant['util_params']['num_gpus'])
    #setup env
    env_name = variant['env_name']
    env_params = variant['env_params']
    env_params['n_tasks'] = variant["n_train_tasks"] + variant["n_eval_tasks"]
    env = NormalizedBoxEnv(ENVS[env_name](**env_params))

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    latent_dim = variant['latent_size']
    reward_dim = 1

    #setup encoder
    context_encoder_input_dim = (
        2 * obs_dim + action_dim + reward_dim
        if variant['algo_params']['use_next_obs_in_context']
        else obs_dim + action_dim + reward_dim)
    context_encoder_output_dim = (
        latent_dim * 2
        if variant['algo_params']['use_information_bottleneck']
        else latent_dim)
    net_size = variant['net_size']
    recurrent = variant['algo_params']['recurrent']
    encoder_model = RecurrentEncoder if recurrent else MlpEncoder

    context_encoder = encoder_model(
        hidden_sizes=[200, 200, 200],
        input_size=context_encoder_input_dim,
        output_size=context_encoder_output_dim,
    )

    #setup actor, critic
    qf1 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    target_qf1 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    target_qf2 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size, net_size],
        obs_dim=obs_dim + latent_dim,
        latent_dim=latent_dim,
        action_dim=action_dim,
    )
    agent = PEARLAgent(latent_dim, context_encoder, policy,
                       **variant['algo_params'])

    algorithm = PEARLSoftActorCritic(
        env=env,
        train_tasks=list(np.arange(variant['n_train_tasks'])),
        eval_tasks=list(
            np.arange(variant['n_train_tasks'],
                      variant['n_train_tasks'] + variant['n_eval_tasks'])),
        nets=[agent, qf1, qf2, target_qf1, target_qf2],
        latent_dim=latent_dim,
        **variant['algo_params'])
    # optionally load pre-trained weights
    if variant['path_to_weights'] is not None:
        path = variant['path_to_weights']
        context_encoder.load_state_dict(
            torch.load(os.path.join(path, 'context_encoder.pth')))
        qf1.load_state_dict(torch.load(os.path.join(path, 'qf1.pth')))
        qf2.load_state_dict(torch.load(os.path.join(path, 'qf2.pth')))

        target_qf1.load_state_dict(
            torch.load(os.path.join(path, 'target_qf1.pth')))
        target_qf2.load_state_dict(
            torch.load(os.path.join(path, 'target_qf2.pth')))

        # TODO hacky, revisit after model refactor
        policy.load_state_dict(torch.load(os.path.join(path, 'policy.pth')))

    if ptu.gpu_enabled():
        algorithm.to()

    os.environ['DEBUG'] = str(int(variant['util_params']['debug']))

    #setup logger
    run_mode = variant['run_mode']
    exp_log_name = os.path.join(
        variant['env_name'], run_mode,
        variant['log_annotation'] + variant['variant_name'],
        'seed-' + str(variant['seed']))

    setup_logger(exp_log_name,
                 variant=variant,
                 exp_id=None,
                 base_log_dir=os.environ.get('PEARL_DATA_PATH'),
                 snapshot_mode='gap',
                 snapshot_gap=10)

    # run the algorithm
    if run_mode == 'TRAIN':
        algorithm.train()
    elif run_mode == 'EVAL':
        assert variant['algo_params']['dump_eval_paths']
        algorithm._try_to_eval()
    else:
        algorithm.eval_with_loaded_latent()
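
The task split in this example is a contiguous index range rather than a slice of env.get_all_task_idx(); a small sketch with placeholder counts:

# Sketch of the contiguous task split used above (counts are placeholders).
import numpy as np

n_train_tasks, n_eval_tasks = 10, 3
n_tasks = n_train_tasks + n_eval_tasks   # what gets written into env_params['n_tasks']

train_tasks = list(np.arange(n_train_tasks))                               # [0, ..., 9]
eval_tasks = list(np.arange(n_train_tasks, n_train_tasks + n_eval_tasks))  # [10, 11, 12]

assert set(train_tasks).isdisjoint(eval_tasks)
print(train_tasks, eval_tasks)
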
Example #6
def experiment(variant):

    # create multi-task environment and sample tasks
    env = NormalizedBoxEnv(ENVS[variant['env_name']](**variant['env_params']))
    tasks = env.get_all_task_idx()
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    reward_dim = 1

    # instantiate networks
    latent_dim = variant['latent_size']
    context_encoder_input_dim = (
        2 * obs_dim + action_dim + reward_dim
        if variant['algo_params']['use_next_obs_in_context']
        else obs_dim + action_dim + reward_dim)
    context_encoder_output_dim = (
        latent_dim * 2
        if variant['algo_params']['use_information_bottleneck']
        else latent_dim)
    net_size = variant['net_size']
    recurrent = variant['algo_params']['recurrent']
    encoder_model = RecurrentEncoder if recurrent else MlpEncoder

    context_encoder = encoder_model(
        hidden_sizes=[200, 200, 200],
        input_size=context_encoder_input_dim,
        output_size=context_encoder_output_dim,
    )

    # low-level Qs first, then high-level Qs
    q_list = [
        [
            FlattenMlp(
                hidden_sizes=[net_size, net_size, net_size],
                input_size=2 * obs_dim + action_dim,
                output_size=1,
            ),
            FlattenMlp(
                hidden_sizes=[net_size, net_size, net_size],
                input_size=2 * obs_dim + action_dim,
                output_size=1,
            ),
        ],
        [
            FlattenMlp(
                hidden_sizes=[net_size, net_size, net_size],
                input_size=obs_dim + action_dim + latent_dim,
                output_size=1,
            ),
            FlattenMlp(
                hidden_sizes=[net_size, net_size, net_size],
                input_size=obs_dim + action_dim + latent_dim,
                output_size=1,
            ),
        ],
    ]
    # low-level vf first, then high-level vf
    vf_list = [
        FlattenMlp(
            hidden_sizes=[net_size, net_size, net_size],
            input_size=2 * obs_dim,
            output_size=1,
        ),
        FlattenMlp(
            hidden_sizes=[net_size, net_size, net_size],
            input_size=obs_dim + latent_dim,
            output_size=1,
        )
    ]

    # NOTE: Reduced the number of hidden layers in h_policy from 3 to 2 (it does not need to do as much as the full policy in PEARL)
    h_policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size],
        obs_dim=obs_dim + latent_dim,
        latent_dim=latent_dim,
        action_dim=obs_dim,
    )
    # NOTE: Kept a deeper network for the low-level policy, since it will see far more data
    l_policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size, net_size, net_size],
        obs_dim=2 * obs_dim,
        latent_dim=0,
        action_dim=action_dim,
    )
    # TODO: Implement BURNAgent
    agent = BURNAgent(latent_dim,
                      context_encoder,
                      h_policy,
                      l_policy,
                      c=2,
                      **variant['algo_params'])
    algorithm = BURNSoftActorCritic(
        env=env,
        train_tasks=list(tasks[:variant['n_train_tasks']]),
        eval_tasks=list(tasks[-variant['n_eval_tasks']:]),
        nets=[agent, q_list, vf_list],
        latent_dim=latent_dim,
        **variant['algo_params'])

    # optionally load pre-trained weights
    #TODO Make sure weights are properly saved
    if variant['path_to_weights'] is not None:
        path = variant['path_to_weights']
        context_encoder.load_state_dict(
            torch.load(os.path.join(path, 'context_encoder.pth')))
        q_list[0][0].load_state_dict(
            torch.load(os.path.join(path, 'l_qf1.pth')))
        q_list[0][1].load_state_dict(
            torch.load(os.path.join(path, 'l_qf2.pth')))
        q_list[1][0].load_state_dict(
            torch.load(os.path.join(path, 'h_qf1.pth')))
        q_list[1][1].load_state_dict(
            torch.load(os.path.join(path, 'h_qf2.pth')))
        vf_list[0].load_state_dict(torch.load(os.path.join(path, 'l_vf.pth')))
        vf_list[1].load_state_dict(torch.load(os.path.join(path, 'h_vf.pth')))
        # TODO hacky, revisit after model refactor
        algorithm.networks[-2].load_state_dict(
            torch.load(os.path.join(path, 'target_vf.pth')))
        h_policy.load_state_dict(torch.load(os.path.join(path,
                                                         'h_policy.pth')))
        l_policy.load_state_dict(torch.load(os.path.join(path,
                                                         'l_policy.pth')))

    # optional GPU mode
    ptu.set_gpu_mode(variant['util_params']['use_gpu'],
                     variant['util_params']['gpu_id'])
    if ptu.gpu_enabled():
        algorithm.to()

    # debugging triggers a lot of printing and logs to a debug directory
    DEBUG = variant['util_params']['debug']
    os.environ['DEBUG'] = str(int(DEBUG))

    # create logging directory
    # TODO support Docker
    exp_id = 'debug' if DEBUG else None
    experiment_log_dir = setup_logger(
        variant['env_name'],
        variant=variant,
        exp_id=exp_id,
        base_log_dir=variant['util_params']['base_log_dir'])

    # optionally save eval trajectories as pkl files
    if variant['algo_params']['dump_eval_paths']:
        pickle_dir = experiment_log_dir + '/eval_trajectories'
        pathlib.Path(pickle_dir).mkdir(parents=True, exist_ok=True)

    # run the algorithm
    algorithm.train()
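
For reference, the nested q_list / vf_list indices map onto the checkpoint files as follows; the mapping is read off the load_state_dict calls above, and the dict itself is only a readability aid.

# Index-to-checkpoint mapping implied by the weight-loading code above.
checkpoint_for = {
    ('q_list', 0, 0): 'l_qf1.pth',   # low-level critic 1 (input: 2*obs_dim + action_dim)
    ('q_list', 0, 1): 'l_qf2.pth',   # low-level critic 2
    ('q_list', 1, 0): 'h_qf1.pth',   # high-level critic 1 (input: obs_dim + action_dim + latent_dim)
    ('q_list', 1, 1): 'h_qf2.pth',   # high-level critic 2
    ('vf_list', 0): 'l_vf.pth',      # low-level value function (input: 2*obs_dim)
    ('vf_list', 1): 'h_vf.pth',      # high-level value function (input: obs_dim + latent_dim)
}
for key, fname in checkpoint_for.items():
    print(key, '->', fname)
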
Example #7
def main(
        env_name,
        seed,
        deterministic,
        traj_prior,
        start_ft_after,
        ft_steps,
        avoid_freezing_z,
        lr,
        batch_size,
        avoid_loading_critics
):
    config = "configs/{}.json".format(env_name)
    variant = default_config
    with open(config) as f:
        exp_params = json.load(f)
    variant = deep_update_dict(exp_params, variant)

    exp_name = variant['env_name']
    print("Experiment: {}".format(exp_name))

    env = NormalizedBoxEnv(ENVS[exp_name](**variant['env_params']))
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    print("Observation space:")
    print(env.observation_space)
    print(obs_dim)
    print("Action space:")
    print(env.action_space)
    print(action_dim)
    print("-" * 10)

    # instantiate networks
    latent_dim = variant['latent_size']
    reward_dim = 1
    context_encoder_input_dim = 2 * obs_dim + action_dim + reward_dim if variant['algo_params']['use_next_obs_in_context'] else obs_dim + action_dim + reward_dim
    context_encoder_output_dim = latent_dim * 2 if variant['algo_params']['use_information_bottleneck'] else latent_dim
    net_size = variant['net_size']
    recurrent = variant['algo_params']['recurrent']
    encoder_model = RecurrentEncoder if recurrent else MlpEncoder

    context_encoder = encoder_model(
        hidden_sizes=[200, 200, 200],
        input_size=context_encoder_input_dim,
        output_size=context_encoder_output_dim,
    )
    qf1 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    target_qf1 = qf1.copy()
    target_qf2 = qf2.copy()
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size, net_size],
        obs_dim=obs_dim + latent_dim,
        latent_dim=latent_dim,
        action_dim=action_dim,
    )
    agent = PEARLAgent(
        latent_dim,
        context_encoder,
        policy,
        **variant['algo_params']
    )

    # deterministic eval
    if deterministic:
        agent = MakeDeterministic(agent)

    # load trained weights (otherwise simulate random policy)
    path_to_exp = "output/{}/pearl_{}".format(env_name, seed-1)
    print("Based on experiment: {}".format(path_to_exp))
    context_encoder.load_state_dict(torch.load(os.path.join(path_to_exp, 'context_encoder.pth')))
    policy.load_state_dict(torch.load(os.path.join(path_to_exp, 'policy.pth')))
    if not avoid_loading_critics:
        qf1.load_state_dict(torch.load(os.path.join(path_to_exp, 'qf1.pth')))
        qf2.load_state_dict(torch.load(os.path.join(path_to_exp, 'qf2.pth')))
        target_qf1.load_state_dict(torch.load(os.path.join(path_to_exp, 'target_qf1.pth')))
        target_qf2.load_state_dict(torch.load(os.path.join(path_to_exp, 'target_qf2.pth')))

    # optional GPU mode
    ptu.set_gpu_mode(variant['util_params']['use_gpu'], variant['util_params']['gpu_id'])
    if ptu.gpu_enabled():
        device = ptu.device  # assuming ptu exposes the active torch device, as in rlkit
        agent.to(device)
        policy.to(device)
        context_encoder.to(device)
        qf1.to(device)
        qf2.to(device)
        target_qf1.to(device)
        target_qf2.to(device)

    helper = PEARLFineTuningHelper(
        env=env,
        agent=agent,
        qf1=qf1,
        qf2=qf2,
        target_qf1=target_qf1,
        target_qf2=target_qf2,

        num_exp_traj_eval=traj_prior,
        start_fine_tuning=start_ft_after,
        fine_tuning_steps=ft_steps,
        should_freeze_z=(not avoid_freezing_z),

        replay_buffer_size=int(1e6),
        batch_size=batch_size,
        discount=0.99,
        policy_lr=lr,
        qf_lr=lr,
        temp_lr=lr,
        target_entropy=-action_dim,
    )

    helper.fine_tune(variant=variant, seed=seed)
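
A hypothetical invocation of main(); the argument values are placeholders, and the code above expects configs/{env_name}.json and pretrained weights under output/{env_name}/pearl_{seed-1} to exist beforehand.

# Hypothetical call; all values are placeholders.
main(
    env_name='cheetah-vel',      # must match a configs/*.json file
    seed=1,                      # weights are loaded from output/cheetah-vel/pearl_0
    deterministic=True,
    traj_prior=2,                # passed to the helper as num_exp_traj_eval
    start_ft_after=0,            # passed as start_fine_tuning
    ft_steps=1000,               # passed as fine_tuning_steps
    avoid_freezing_z=False,
    lr=3e-4,
    batch_size=256,
    avoid_loading_critics=False,
)
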