    def __init__(self, index, variant, candidate_size=10):
        ptu.set_gpu_mode(True)
        torch.set_num_threads(1)

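        # Reset sys.argv, presumably so that libraries imported inside this
        # worker that parse command-line arguments do not see the parent
        # process's CLI flags.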
        import sys
        sys.argv = ['']
        del sys

        env_max_action = variant['env_max_action']
        obs_dim = variant['obs_dim']
        action_dim = variant['action_dim']
        latent_dim = variant['latent_dim']
        vae_latent_dim = 2 * action_dim
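        # BCQ-style convention: the conditional-VAE latent is twice the
        # action dimension.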
        mlp_enconder_input_size = (
            2 * obs_dim + action_dim + 1
            if variant['use_next_obs_in_context']
            else obs_dim + action_dim + 1)

        mlp_enconder = MlpEncoder(hidden_sizes=[200, 200, 200],
                                  input_size=mlp_enconder_input_size,
                                  output_size=2 * variant['latent_dim'])
        self.context_encoder = ProbabilisticContextEncoder(
            mlp_enconder, variant['latent_dim'])
        self.Qs = FlattenMlp(
            hidden_sizes=variant['Qs_hidden_sizes'],
            input_size=obs_dim + action_dim + latent_dim,
            output_size=1,
        )
        self.vae_decoder = VaeDecoder(
            max_action=variant['env_max_action'],
            hidden_sizes=variant['vae_hidden_sizes'],
            input_size=obs_dim + vae_latent_dim + latent_dim,
            output_size=action_dim,
        )
        self.perturbation_generator = PerturbationGenerator(
            max_action=env_max_action,
            hidden_sizes=variant['perturbation_hidden_sizes'],
            input_size=obs_dim + action_dim + latent_dim,
            output_size=action_dim,
        )

        self.use_next_obs_in_context = variant['use_next_obs_in_context']

        self.env = env_producer(variant['domain'], variant['seed'])
        self.num_evals = variant['num_evals']
        self.max_path_length = variant['max_path_length']

        self.vae_latent_dim = vae_latent_dim
        self.candidate_size = variant['candidate_size']

        self.env.seed(10 * variant['seed'] + 1234 + index)
        set_seed(10 * variant['seed'] + 1234 + index)
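        # Each parallel evaluation worker thus draws from its own random
        # stream: 10 * seed + 1234 + worker index.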

        self.env.action_space.np_random.seed(123 + index)
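
For reference, a minimal sketch (not from the original source) of the `variant` dictionary this constructor reads. All concrete values below are placeholder assumptions, and `Worker` is a hypothetical name standing in for the enclosing class, which the snippet does not show:

example_variant = {
    'env_max_action': 1.0,
    'obs_dim': 17,
    'action_dim': 6,
    'latent_dim': 20,
    'use_next_obs_in_context': True,
    'Qs_hidden_sizes': [256, 256],
    'vae_hidden_sizes': [256, 256],
    'perturbation_hidden_sizes': [256, 256],
    'domain': 'walker-param',
    'seed': 0,
    'num_evals': 5,
    'max_path_length': 200,
    'candidate_size': 10,
}
# worker = Worker(index=0, variant=example_variant)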
Example #2
    def __init__(self, index, variant, candidate_size=10):
        ptu.set_gpu_mode(True)
        torch.set_num_threads(1)

        import sys
        sys.argv = ['']
        del sys

        env_max_action = variant['env_max_action']
        obs_dim = variant['obs_dim']
        action_dim = variant['action_dim']
        latent_dim = variant['latent_dim']
        vae_latent_dim = 2 * action_dim

        self.f = MlpEncoder(
            g_hidden_sizes=variant['g_hidden_sizes'],
            g_input_sizes=obs_dim + action_dim + 1,
            g_latent_dim=variant['g_latent_dim'],
            h_hidden_sizes=variant['h_hidden_sizes'],
            latent_dim=latent_dim,
        )
        self.Qs = FlattenMlp(
            hidden_sizes=variant['Qs_hidden_sizes'],
            input_size=obs_dim + action_dim + latent_dim,
            output_size=1,
        )
        self.vae_decoder = VaeDecoder(
            max_action=variant['env_max_action'],
            hidden_sizes=variant['vae_hidden_sizes'],
            input_size=obs_dim + vae_latent_dim + latent_dim,
            output_size=action_dim,
        )
        self.perturbation_generator = PerturbationGenerator(
            max_action=env_max_action,
            hidden_sizes=variant['perturbation_hidden_sizes'],
            input_size=obs_dim + action_dim + latent_dim,
            output_size=action_dim,
        )

        self.env = env_producer(variant['domain'], variant['seed'])
        self.num_evals = variant['algo_params']['num_evals']
        self.max_path_length = variant['max_path_length']

        self.vae_latent_dim = vae_latent_dim
        self.num_trans_context = variant['num_trans_context']
        self.candidate_size = variant['candidate_size']
        self.seed = variant['seed']
        self.index = index

        self.env.seed(10 * self.seed + 1234 + index)
        set_seed(10 * self.seed + 1234 + index)
Example #3
def experiment(variant, bcq_buffers, prev_exp_state=None):
    # Create the multitask replay buffer based on the buffer list
    train_buffer = MultiTaskReplayBuffer(bcq_buffers_list=bcq_buffers, )
    # create multi-task environment and sample tasks
    env = env_producer(variant['domain'], variant['seed'])
    env.reset()

    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))

    # instantiate networks
    network_ensemble = []
    for _ in range(variant['num_network_ensemble']):
        P = FlattenMlp(
            hidden_sizes=variant['P_hidden_sizes'],
            input_size=obs_dim + action_dim,
            output_size=1,
        )
        network_ensemble.append(P)

    trainer = SuperQTrainer(
        env,
        network_ensemble=network_ensemble,
        train_goal=variant['train_goal'],
        std_threshold=variant['std_threshold'],
        domain=variant['domain'],
    )

    algorithm = BatchMetaRLAlgorithm(
        trainer,
        train_buffer,
        **variant['algo_params'],
    )

    algorithm.to(ptu.device)

    start_epoch = (prev_exp_state['epoch'] + 1
                   if prev_exp_state is not None else 0)

    algorithm.train(start_epoch)
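
A hedged sketch of how this entry point might be invoked. The keys mirror the lookups above; the hidden sizes, goal and threshold values are placeholders, and `my_bcq_buffers` is assumed to be a list of pre-loaded per-task buffers (see the ReplayBuffer.remote loading loop in a later example):

example_variant = {
    'domain': 'ant-dir',
    'seed': 0,
    'num_network_ensemble': 20,
    'P_hidden_sizes': [256, 256],
    'train_goal': 0.0,
    'std_threshold': 0.1,
    'algo_params': {},  # forwarded verbatim to BatchMetaRLAlgorithm
}
# experiment(example_variant, bcq_buffers=my_bcq_buffers)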
Example #4
    def __init__(self, variant, goal, candidate_size=10):
        ptu.set_gpu_mode(True)
        torch.set_num_threads(1)

        import sys
        sys.argv = ['']
        del sys

        self.env = env_producer(variant['env_name'], seed=0, goal=goal)
        obs_dim = int(np.prod(self.env.observation_space.shape))
        action_dim = int(np.prod(self.env.action_space.shape))
        reward_dim = 1

        # instantiate networks
        latent_dim = variant['latent_size']
        context_encoder_input_dim = (
            2 * obs_dim + action_dim + reward_dim
            if variant['algo_params']['use_next_obs_in_context']
            else obs_dim + action_dim + reward_dim)
        context_encoder_output_dim = (
            latent_dim * 2
            if variant['algo_params']['use_information_bottleneck']
            else latent_dim)
        net_size = variant['net_size']
        recurrent = variant['algo_params']['recurrent']
        encoder_model = RecurrentEncoder if recurrent else MlpEncoder

        context_encoder = encoder_model(
            hidden_sizes=[200, 200, 200],
            input_size=context_encoder_input_dim,
            output_size=context_encoder_output_dim,
        )
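        # Dimension check with hypothetical sizes (obs_dim=20, action_dim=6,
        # latent_dim=5): with next-observation context the encoder input is
        # 2*20 + 6 + 1 = 47, otherwise 20 + 6 + 1 = 27; with the information
        # bottleneck it outputs 2*5 = 10 values (parameters of the latent
        # Gaussian), otherwise 5.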

        policy = TanhGaussianPolicy(
            hidden_sizes=[net_size, net_size, net_size],
            obs_dim=obs_dim + latent_dim,
            latent_dim=latent_dim,
            action_dim=action_dim,
        )
        self.agent = PEARLAgent(latent_dim, context_encoder, policy,
                                **variant['algo_params'])
        self.num_evals = variant['num_evals']
        self.max_path_length = variant['max_path_length']
Example #5
    def __init__(self, index, variant, candidate_size=10):
        ptu.set_gpu_mode(True)
        torch.set_num_threads(1)

        import sys
        sys.argv = ['']
        del sys

        self.env = env_producer(variant['domain'], variant['seed'])
        state_dim = self.env.observation_space.low.size
        action_dim = self.env.action_space.low.size
        max_action = float(self.env.action_space.high[0])

        self.policy = BCQ(state_dim, action_dim, max_action,
                          **variant['policy_params'])
        self.num_evals = variant['num_evals']
        self.max_path_length = variant['max_path_length']
        self.seed = variant['seed']
        self.index = index

        self.env.seed(10 * self.seed + 1234 + index)
        set_seed(10 * self.seed + 1234 + index)
Example #6
    def __init__(
        self,
        domain_name,
        env_seed,
        policy_producer,
        max_num_epoch_paths_saved=None,
        render=False,
        render_kwargs=None,
    ):

        torch.set_num_threads(1)

        env = env_producer(domain_name, env_seed)

        self._policy_producer = policy_producer

        super().__init__(
            env,
            max_num_epoch_paths_saved=max_num_epoch_paths_saved,
            render=render,
            render_kwargs=render_kwargs,
        )
Example #7
def experiment(variant, prev_exp_state=None):

    domain = variant['domain']
    seed = variant['seed']
    goal = variant['goal']

    expl_env = env_producer(domain, seed, goal)

    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size

    print('------------------------------------------------')
    print('obs_dim', obs_dim)
    print('action_dim', action_dim)
    print('------------------------------------------------')

    # Get producer function for policy and value functions
    M = variant['layer_size']

    q_producer = get_q_producer(
        obs_dim,
        action_dim,
        hidden_sizes=[1024, 1024, 1024, 1024, 1024, 1024, 1024])
    policy_producer = get_policy_producer(obs_dim,
                                          action_dim,
                                          hidden_sizes=[M, M])
    # Finished getting producer

    remote_eval_path_collector = RemoteMdpPathCollector.remote(
        domain, seed * 10 + 1, goal, policy_producer)

    expl_path_collector = MdpPathCollector(expl_env, )
    replay_buffer = ReplayBuffer(variant['replay_buffer_size'],
                                 ob_space=expl_env.observation_space,
                                 action_space=expl_env.action_space)
    trainer = SACTrainer(policy_producer,
                         q_producer,
                         action_space=expl_env.action_space,
                         **variant['trainer_kwargs'])

    algorithm = BatchRLAlgorithm(
        trainer=trainer,
        exploration_data_collector=expl_path_collector,
        remote_eval_data_collector=remote_eval_path_collector,
        replay_buffer=replay_buffer,
        optimistic_exp_hp=variant['optimistic_exp'],
        **variant['algorithm_kwargs'])

    algorithm.to(ptu.device)

    if prev_exp_state is not None:

        expl_path_collector.restore_from_snapshot(
            prev_exp_state['exploration'])

        ray.get([
            remote_eval_path_collector.restore_from_snapshot.remote(
                prev_exp_state['evaluation_remote'])
        ])
        ray.get([
            remote_eval_path_collector.set_global_pkg_rng_state.remote(
                prev_exp_state['evaluation_remote_rng_state'])
        ])

        replay_buffer.restore_from_snapshot(prev_exp_state['replay_buffer'])

        trainer.restore_from_snapshot(prev_exp_state['trainer'])

        set_global_pkg_rng_state(prev_exp_state['global_pkg_rng_state'])

    start_epoch = (prev_exp_state['epoch'] + 1
                   if prev_exp_state is not None else 0)

    algorithm.train(start_epoch)
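
The resume branch above implies a particular snapshot layout. Below is a minimal sketch of the expected prev_exp_state dictionary, inferred only from the keys this function reads; the value types and the epoch number are assumptions:

example_prev_exp_state = {
    'epoch': 42,                        # last completed epoch
    'exploration': {},                  # expl_path_collector snapshot
    'evaluation_remote': {},            # remote eval collector snapshot
    'evaluation_remote_rng_state': {},  # its RNG state
    'replay_buffer': {},                # replay buffer snapshot
    'trainer': {},                      # SACTrainer snapshot
    'global_pkg_rng_state': {},         # restored via set_global_pkg_rng_state
}
# experiment(variant, prev_exp_state=example_prev_exp_state)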
Example #8
def experiment(variant, prev_exp_state=None):

    domain = variant['domain']
    seed = variant['seed']
    goal = variant['goal']

    expl_env = env_producer(domain, seed, goal)

    env_max_action = float(expl_env.action_space.high[0])
    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size
    vae_latent_dim = 2 * action_dim
    mlp_enconder_input_size = 2 * obs_dim + action_dim + 1

    print('------------------------------------------------')
    print('obs_dim', obs_dim)
    print('action_dim', action_dim)
    print('------------------------------------------------')

    # Network module from tiMe

    mlp_enconder = MlpEncoder(hidden_sizes=[200, 200, 200],
                              input_size=mlp_enconder_input_size,
                              output_size=2 * variant['latent_dim'])

    context_encoder = ProbabilisticContextEncoder(mlp_enconder,
                                                  variant['latent_dim'])

    qf1 = FlattenMlp(
        hidden_sizes=variant['Qs_hidden_sizes'],
        input_size=obs_dim + action_dim + variant['latent_dim'],
        output_size=1,
    )
    target_qf1 = FlattenMlp(
        hidden_sizes=variant['Qs_hidden_sizes'],
        input_size=obs_dim + action_dim + variant['latent_dim'],
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=variant['Qs_hidden_sizes'],
        input_size=obs_dim + action_dim + variant['latent_dim'],
        output_size=1,
    )
    target_qf2 = FlattenMlp(
        hidden_sizes=variant['Qs_hidden_sizes'],
        input_size=obs_dim + action_dim + variant['latent_dim'],
        output_size=1,
    )
    vae_decoder = VaeDecoder(
        max_action=env_max_action,
        hidden_sizes=variant['vae_hidden_sizes'],
        input_size=obs_dim + vae_latent_dim + variant['latent_dim'],
        output_size=action_dim,
    )
    perturbation_generator = PerturbationGenerator(
        max_action=env_max_action,
        hidden_sizes=variant['perturbation_hidden_sizes'],
        input_size=obs_dim + action_dim + variant['latent_dim'],
        output_size=action_dim,
    )

    # Load the params obtained by tiMe
    ss = load_gzip_pickle(variant['path_to_snapshot'])
    ss = ss['trainer']

    encoder_state_dict = OrderedDict()
    for key, value in ss['context_encoder_state_dict'].items():
        if 'mlp_encoder' in key:
            encoder_state_dict[key.replace('mlp_encoder.', '')] = value

    mlp_enconder.load_state_dict(encoder_state_dict)
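    # The loop above strips the submodule prefix so that a checkpoint entry
    # such as 'mlp_encoder.fc0.weight' is loaded into this MlpEncoder as
    # 'fc0.weight' (the concrete parameter name is illustrative).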

    qf1.load_state_dict(ss['Qs_state_dict'])

    target_qf1.load_state_dict(ss['Qs_state_dict'])

    qf2.load_state_dict(ss['Qs_state_dict'])

    target_qf2.load_state_dict(ss['Qs_state_dict'])

    vae_decoder.load_state_dict(ss['vae_decoder_state_dict'])

    perturbation_generator.load_state_dict(ss['perturbation_generator_dict'])

    tiMe_path_collector = tiMeSampler(
        expl_env,
        context_encoder,
        qf1,
        vae_decoder,
        perturbation_generator,
        vae_latent_dim=vae_latent_dim,
        candidate_size=variant['candidate_size'],
    )
    tiMe_path_collector.to(ptu.device)

    # Get producer function for policy
    policy_producer = get_policy_producer(
        obs_dim, action_dim, hidden_sizes=variant['policy_hidden_sizes'])
    # Finished getting producer

    remote_eval_path_collector = RemoteMdpPathCollector.remote(
        domain, seed * 10 + 1, goal, policy_producer)
    expl_path_collector = MdpPathCollector(expl_env, )
    replay_buffer = ReplayBuffer(variant['replay_buffer_size'],
                                 ob_space=expl_env.observation_space,
                                 action_space=expl_env.action_space)
    trainer = SACTrainer(policy_producer,
                         qf1=qf1,
                         target_qf1=target_qf1,
                         qf2=qf2,
                         target_qf2=target_qf2,
                         action_space=expl_env.action_space,
                         **variant['trainer_kwargs'])

    algorithm = BatchRLAlgorithm(
        trainer=trainer,
        exploration_data_collector=expl_path_collector,
        remote_eval_data_collector=remote_eval_path_collector,
        tiMe_data_collector=tiMe_path_collector,
        replay_buffer=replay_buffer,
        optimistic_exp_hp=variant['optimistic_exp'],
        **variant['algorithm_kwargs'])

    algorithm.to(ptu.device)

    start_epoch = (prev_exp_state['epoch'] + 1
                   if prev_exp_state is not None else 0)

    algorithm.train(start_epoch)
Example #9
# wd_goals = np.random.uniform(0, 1, size=(8,)) * np.pi * 2 / 3
# ood_goals = np.random.uniform(2 / 3, 1.0, size=(8,)) * np.pi

# idx_list = [0, 1, 4, 10, 12, 14, 17, 21, 26, 27]
# train_goals = train_goals[idx_list]

# filename = './goals/humanoid-openai-dir-normal-goals.pkl'
# with open(filename, 'wb') as f:
#     pickle.dump([idx_list, train_goals, wd_goals, ood_goals], f)

# print([idx_list, train_goals, wd_goals, ood_goals])

# #---------------------Walker-Param-Normal-------------------------

sample_env = env_producer('walker-param', 0)
train_goals = sample_env.sample_tasks(30,
                                      is_train=True,
                                      is_within_distribution=True)
wd_goals = sample_env.sample_tasks(8,
                                   is_train=False,
                                   is_within_distribution=True)
ood_goals = sample_env.sample_tasks(8,
                                    is_train=False,
                                    is_within_distribution=False)

idx_list = list(range(30))
train_goals = [train_goals[idx] for idx in idx_list]

filename = './goals/walker-param-normal-goals.pkl'
with open(filename, 'wb') as f:
    # Mirrors the commented-out dump pattern shown above.
    pickle.dump([idx_list, train_goals, wd_goals, ood_goals], f)
Example #10
def experiment(variant):

    domain = variant['domain']
    seed = variant['seed']
    exp_mode = variant['exp_mode']
    max_path_length = variant['algo_params']['max_path_length']
    bcq_interactions = variant['bcq_interactions']
    num_tasks = variant['num_tasks']

    filename = f'./goals/{domain}-{exp_mode}-goals.pkl'
    idx_list, train_goals, wd_goals, ood_goals = pickle.load(
        open(filename, 'rb'))
    idx_list = idx_list[:num_tasks]

    sub_buffer_dir = f"buffers/{domain}/{exp_mode}/max_path_length_{max_path_length}/interactions_{bcq_interactions}k/seed_{seed}"
    buffer_dir = os.path.join(variant['data_models_root'], sub_buffer_dir)

    print("Buffer directory: " + buffer_dir)

    # Load buffer
    bcq_buffers = []

    buffer_loader_id_list = []
    for i, idx in enumerate(idx_list):
        bname = f'goal_0{idx}.zip_pkl' if idx < 10 else f'goal_{idx}.zip_pkl'
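        # Zero-pads the goal index to two digits; equivalent to
        # f'goal_{idx:02d}.zip_pkl' for idx < 100.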
        filename = os.path.join(buffer_dir, bname)
        rp_buffer = ReplayBuffer.remote(
            index=i,
            seed=seed,
            num_trans_context=variant['num_trans_context'],
            in_mdp_batch_size=variant['in_mdp_batch_size'],
        )

        buffer_loader_id_list.append(rp_buffer.load_from_gzip.remote(filename))
        bcq_buffers.append(rp_buffer)
    ray.get(buffer_loader_id_list)

    assert len(bcq_buffers) == len(idx_list)

    train_buffer = MultiTaskReplayBuffer(bcq_buffers_list=bcq_buffers, )

    set_seed(variant['seed'])

    # create multi-task environment and sample tasks
    env = env_producer(variant['domain'], seed=0)
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    reward_dim = 1

    # instantiate networks
    latent_dim = variant['latent_size']
    context_encoder_input_dim = (
        2 * obs_dim + action_dim + reward_dim
        if variant['algo_params']['use_next_obs_in_context']
        else obs_dim + action_dim + reward_dim)
    context_encoder_output_dim = (
        latent_dim * 2
        if variant['algo_params']['use_information_bottleneck']
        else latent_dim)
    net_size = variant['net_size']
    recurrent = variant['algo_params']['recurrent']
    encoder_model = RecurrentEncoder if recurrent else MlpEncoder

    context_encoder = encoder_model(
        hidden_sizes=[200, 200, 200],
        input_size=context_encoder_input_dim,
        output_size=context_encoder_output_dim,
    )
    qf1 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + action_dim + latent_dim,
        output_size=1,
    )
    vf = FlattenMlp(
        hidden_sizes=[net_size, net_size, net_size],
        input_size=obs_dim + latent_dim,
        output_size=1,
    )
    policy = TanhGaussianPolicy(
        hidden_sizes=[net_size, net_size, net_size],
        obs_dim=obs_dim + latent_dim,
        latent_dim=latent_dim,
        action_dim=action_dim,
    )
    agent = PEARLAgent(latent_dim, context_encoder, policy,
                       **variant['algo_params'])
    algorithm = PEARLSoftActorCritic(env=env,
                                     train_goals=train_goals,
                                     wd_goals=wd_goals,
                                     ood_goals=ood_goals,
                                     replay_buffers=train_buffer,
                                     nets=[agent, qf1, qf2, vf],
                                     latent_dim=latent_dim,
                                     **variant['algo_params'])

    # optionally load pre-trained weights
    if variant['path_to_weights'] is not None:
        path = variant['path_to_weights']
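        # The weights directory is expected to contain context_encoder.pth,
        # qf1.pth, qf2.pth, vf.pth, target_vf.pth and policy.pth (see the
        # loads below).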
        context_encoder.load_state_dict(
            torch.load(os.path.join(path, 'context_encoder.pth')))
        qf1.load_state_dict(torch.load(os.path.join(path, 'qf1.pth')))
        qf2.load_state_dict(torch.load(os.path.join(path, 'qf2.pth')))
        vf.load_state_dict(torch.load(os.path.join(path, 'vf.pth')))
        # TODO hacky, revisit after model refactor
        algorithm.networks[-2].load_state_dict(
            torch.load(os.path.join(path, 'target_vf.pth')))
        policy.load_state_dict(torch.load(os.path.join(path, 'policy.pth')))

    # optional GPU mode
    ptu.set_gpu_mode(variant['util_params']['use_gpu'],
                     variant['util_params']['gpu_id'])
    if ptu.gpu_enabled():
        algorithm.to()

    # debugging triggers a lot of printing and logs to a debug directory
    DEBUG = variant['util_params']['debug']
    os.environ['DEBUG'] = str(int(DEBUG))

    # create logging directory
    # TODO support Docker
    exp_id = 'debug' if DEBUG else None
    experiment_log_dir = setup_logger(
        variant['domain'],
        variant=variant,
        exp_id=exp_id,
        base_log_dir=variant['util_params']['base_log_dir'])

    # optionally save eval trajectories as pkl files
    if variant['algo_params']['dump_eval_paths']:
        pickle_dir = experiment_log_dir + '/eval_trajectories'
        pathlib.Path(pickle_dir).mkdir(parents=True, exist_ok=True)

    # run the algorithm
    algorithm.train()
Example #11
    return timestamp


std_threshold = 0.1
in_mdp_batch_size = 128

eval_statistics = OrderedDict()

logger.reset()
setup_logger(
    log_dir=osp.join('./tune_threshold_loggings', create_simple_exp_name()))

filename = f'./goals/ant-dir-normal-goals.pkl'
train_goals, wd_goals, ood_goals = pickle.load(open(filename, 'rb'))

env = env_producer('ant-dir', 0, train_goals[0])

for epoch in range(200):

    file_name = osp.join('./data_reward_predictions', f'params_{epoch}.pkl')
    params = pickle.load(open(file_name, "rb"))

    obs = params['obs']
    actions = params['actions']
    rewards = params['rewards']
    pred_rewards = params['pred_rewards']

    obs_other_tasks = [
        obs[in_mdp_batch_size * i:in_mdp_batch_size * (i + 1)]
        for i in range(1, 32)
    ]
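    # The slicing above skips the first in_mdp_batch_size (128) rows, which
    # belong to task 0, and collects one contiguous 128-row block per
    # remaining task (31 blocks, for task indices 1..31).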
Example #12
def experiment(variant,
               bcq_policies,
               bcq_buffers,
               ensemble_params_list,
               prev_exp_state=None):
    # Create the multitask replay buffer based on the buffer list
    train_buffer = MultiTaskReplayBuffer(bcq_buffers_list=bcq_buffers, )
    # create multi-task environment and sample tasks
    env = env_producer(variant['domain'], variant['seed'])

    env_max_action = float(env.action_space.high[0])
    obs_dim = int(np.prod(env.observation_space.shape))
    action_dim = int(np.prod(env.action_space.shape))
    vae_latent_dim = 2 * action_dim
    mlp_enconder_input_size = (
        2 * obs_dim + action_dim + 1
        if variant['use_next_obs_in_context']
        else obs_dim + action_dim + 1)

    variant['env_max_action'] = env_max_action
    variant['obs_dim'] = obs_dim
    variant['action_dim'] = action_dim

    variant['mlp_enconder_input_size'] = mlp_enconder_input_size

    # instantiate networks

    mlp_enconder = MlpEncoder(hidden_sizes=[200, 200, 200],
                              input_size=mlp_enconder_input_size,
                              output_size=2 * variant['latent_dim'])
    context_encoder = ProbabilisticContextEncoder(mlp_enconder,
                                                  variant['latent_dim'])

    ensemble_predictor = EnsemblePredictor(ensemble_params_list)

    Qs = FlattenMlp(
        hidden_sizes=variant['Qs_hidden_sizes'],
        input_size=obs_dim + action_dim + variant['latent_dim'],
        output_size=1,
    )
    vae_decoder = VaeDecoder(
        max_action=env_max_action,
        hidden_sizes=variant['vae_hidden_sizes'],
        input_size=obs_dim + vae_latent_dim + variant['latent_dim'],
        output_size=action_dim,
    )
    perturbation_generator = PerturbationGenerator(
        max_action=env_max_action,
        hidden_sizes=variant['perturbation_hidden_sizes'],
        input_size=obs_dim + action_dim + variant['latent_dim'],
        output_size=action_dim,
    )
    trainer = SuperQTrainer(
        ensemble_predictor=ensemble_predictor,
        num_network_ensemble=variant['num_network_ensemble'],
        bcq_policies=bcq_policies,
        std_threshold=variant['std_threshold'],
        is_combine=variant['is_combine'],
        nets=[context_encoder, Qs, vae_decoder, perturbation_generator])

    path_collector = RemotePathCollector(variant)

    algorithm = BatchMetaRLAlgorithm(
        trainer,
        path_collector,
        train_buffer,
        **variant['algo_params'],
    )

    algorithm.to(ptu.device)

    start_epoch = (prev_exp_state['epoch'] + 1
                   if prev_exp_state is not None else 0)

    # Log the variant
    logger.log("Variant:")
    logger.log(json.dumps(dict_to_safe_json(variant), indent=2))

    algorithm.train(start_epoch)
Example #13
    # set up logger
    variant['log_dir'] = get_log_dir(variant)
    logger.reset()
    setup_logger(log_dir=variant['log_dir'],
                 snapshot_gap=100,
                 snapshot_mode="gap")

    logger.log(f'Seed: {seed}')
    set_seed(seed)

    logger.log(f'Using GPU: {True}')
    set_gpu_mode(mode=True, gpu_id=0)

    # Get the information of the environment
    env = env_producer(domain, seed)

    state_dim = env.observation_space.low.size
    action_dim = env.action_space.low.size
    max_action = float(env.action_space.high[0])

    # Load buffer
    bcq_buffers = []

    buffer_loader_id_list = []
    for i, idx in enumerate(idx_list):
        bname = f'goal_0{idx}.zip_pkl' if idx < 10 else f'goal_{idx}.zip_pkl'
        filename = os.path.join(buffer_dir, bname)
        rp_buffer = ReplayBuffer.remote(
            index=i,
            seed=seed,
Example #14
def experiment(variant, prev_exp_state=None):

    domain = variant['domain']
    seed = variant['seed']

    expl_env = env_producer(domain, seed)

    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size

    obs_dim, action_dim = {
        'GridGoal1': (2, 2),
        'GridGoal2': (2, 2),
        'GridGoal3': (2, 2),
        'AntEscape': (29, 8),
        'AntJump': (29, 8),
        'AntNavigate': (29, 8),
        'HumanoidUp': (47, 17)
    }[domain]
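    # Note: this hard-coded table overrides the obs_dim / action_dim values
    # just derived from the environment spaces above.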

    # Get producer function for policy and value functions
    M = variant['layer_size']

    q_producer = get_q_producer(obs_dim, action_dim, hidden_sizes=[M, M])
    policy_producer = get_policy_producer(obs_dim,
                                          action_dim,
                                          hidden_sizes=[M, M])
    # Finished getting producer

    remote_eval_path_collector = RemoteMdpPathCollector.remote(
        domain, seed * 10 + 1, policy_producer)

    expl_path_collector = MdpPathCollector(expl_env, )
    replay_buffer = ReplayBuffer(variant['replay_buffer_size'],
                                 ob_dim=obs_dim,
                                 ac_dim=action_dim)

    trainer = SACTrainer(policy_producer,
                         q_producer,
                         action_space=expl_env.action_space,
                         **variant['trainer_kwargs'])

    algorithm = BatchRLAlgorithm(
        trainer=trainer,
        exploration_data_collector=expl_path_collector,
        remote_eval_data_collector=remote_eval_path_collector,
        replay_buffer=replay_buffer,
        optimistic_exp_hp=variant['optimistic_exp'],
        log_dir=variant['log_dir'],
        **variant['algorithm_kwargs'])

    algorithm.to(ptu.device)

    if prev_exp_state is not None:

        expl_path_collector.restore_from_snapshot(
            prev_exp_state['exploration'])

        ray.get([
            remote_eval_path_collector.restore_from_snapshot.remote(
                prev_exp_state['evaluation_remote'])
        ])
        ray.get([
            remote_eval_path_collector.set_global_pkg_rng_state.remote(
                prev_exp_state['evaluation_remote_rng_state'])
        ])

        replay_buffer.restore_from_snapshot(prev_exp_state['replay_buffer'])

        trainer.restore_from_snapshot(prev_exp_state['trainer'])

        set_global_pkg_rng_state(prev_exp_state['global_pkg_rng_state'])

    start_epoch = (prev_exp_state['epoch'] + 1
                   if prev_exp_state is not None else 0)

    algorithm.train(start_epoch)
Example #15
def experiment(variant, prev_exp_state=None):

    domain = variant['domain']
    seed = variant['seed']
    goal = variant['goal']

    expl_env = env_producer(domain, seed, goal)

    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size

    print('------------------------------------------------')
    print('obs_dim', obs_dim)
    print('action_dim', action_dim)
    print('------------------------------------------------')

    qf1 = FlattenMlp(
        hidden_sizes=variant['Qs_hidden_sizes'],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    target_qf1 = FlattenMlp(
        hidden_sizes=variant['Qs_hidden_sizes'],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    qf2 = FlattenMlp(
        hidden_sizes=variant['Qs_hidden_sizes'],
        input_size=obs_dim + action_dim,
        output_size=1,
    )
    target_qf2 = FlattenMlp(
        hidden_sizes=variant['Qs_hidden_sizes'],
        input_size=obs_dim + action_dim,
        output_size=1,
    )

    # Get producer function for policy
    policy_producer = get_policy_producer(
        obs_dim, action_dim, hidden_sizes=variant['policy_hidden_sizes'])
    # Finished getting producer

    remote_eval_path_collector = RemoteMdpPathCollector.remote(
        domain, seed * 10 + 1, goal, policy_producer)
    expl_path_collector = MdpPathCollector(expl_env, )
    replay_buffer = ReplayBuffer(variant['replay_buffer_size'],
                                 ob_space=expl_env.observation_space,
                                 action_space=expl_env.action_space)
    trainer = SACTrainer(policy_producer,
                         qf1=qf1,
                         target_qf1=target_qf1,
                         qf2=qf2,
                         target_qf2=target_qf2,
                         action_space=expl_env.action_space,
                         **variant['trainer_kwargs'])

    algorithm = BatchRLAlgorithm(
        trainer=trainer,
        exploration_data_collector=expl_path_collector,
        remote_eval_data_collector=remote_eval_path_collector,
        replay_buffer=replay_buffer,
        optimistic_exp_hp=variant['optimistic_exp'],
        **variant['algorithm_kwargs'])

    algorithm.to(ptu.device)

    start_epoch = (prev_exp_state['epoch'] + 1
                   if prev_exp_state is not None else 0)

    algorithm.train(start_epoch)