Example #1
def experiment(variant, prev_exp_state=None):

    domain = variant['domain']
    seed = variant['seed']
    goal = variant['goal']

    expl_env = env_producer(domain, seed, goal)

    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size

    print('------------------------------------------------')
    print('obs_dim', obs_dim)
    print('action_dim', action_dim)
    print('------------------------------------------------')

    # Get producer functions for the policy and Q-value networks.
    # The policy below uses the configured layer size M, while the Q-networks
    # are built with a fixed stack of seven 1024-unit hidden layers.
    M = variant['layer_size']

    q_producer = get_q_producer(
        obs_dim,
        action_dim,
        hidden_sizes=[1024, 1024, 1024, 1024, 1024, 1024, 1024])
    policy_producer = get_policy_producer(obs_dim,
                                          action_dim,
                                          hidden_sizes=[M, M])
    # Finished getting producers

    remote_eval_path_collector = RemoteMdpPathCollector.remote(
        domain, seed * 10 + 1, goal, policy_producer)

    expl_path_collector = MdpPathCollector(expl_env)
    replay_buffer = ReplayBuffer(variant['replay_buffer_size'],
                                 ob_space=expl_env.observation_space,
                                 action_space=expl_env.action_space)
    trainer = SACTrainer(policy_producer,
                         q_producer,
                         action_space=expl_env.action_space,
                         **variant['trainer_kwargs'])

    algorithm = BatchRLAlgorithm(
        trainer=trainer,
        exploration_data_collector=expl_path_collector,
        remote_eval_data_collector=remote_eval_path_collector,
        replay_buffer=replay_buffer,
        optimistic_exp_hp=variant['optimistic_exp'],
        **variant['algorithm_kwargs'])

    algorithm.to(ptu.device)

    if prev_exp_state is not None:

        expl_path_collector.restore_from_snapshot(
            prev_exp_state['exploration'])

        ray.get([
            remote_eval_path_collector.restore_from_snapshot.remote(
                prev_exp_state['evaluation_remote'])
        ])
        ray.get([
            remote_eval_path_collector.set_global_pkg_rng_state.remote(
                prev_exp_state['evaluation_remote_rng_state'])
        ])

        replay_buffer.restore_from_snapshot(prev_exp_state['replay_buffer'])

        trainer.restore_from_snapshot(prev_exp_state['trainer'])

        set_global_pkg_rng_state(prev_exp_state['global_pkg_rng_state'])

    start_epoch = (prev_exp_state['epoch'] + 1
                   if prev_exp_state is not None else 0)

    algorithm.train(start_epoch)
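
A minimal sketch of the variant dictionary this example expects; only the keys are taken from the code above, and every value here is an illustrative assumption rather than a setting from the original project.

# Hypothetical configuration for experiment() above; all values are placeholders.
variant = dict(
    domain='mountaincar',            # assumed environment name for env_producer
    seed=0,
    goal=None,                       # passed through to env_producer
    layer_size=256,                  # M, hidden width of the policy network
    replay_buffer_size=int(1e6),
    optimistic_exp=dict(),           # hyper-parameters for optimistic exploration
    trainer_kwargs=dict(),           # forwarded to SACTrainer
    algorithm_kwargs=dict(),         # forwarded to BatchRLAlgorithm
)

experiment(variant)                  # fresh run; pass prev_exp_state to resume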
Example #2
def set_global_pkg_rng_state(self, state):
    set_global_pkg_rng_state(state)
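
The restore branch in these examples reads a previous-experiment snapshot whose structure is implied by the keys it accesses; a rough sketch, with placeholder values, looks like this.

# Keys inferred from the restore code above; the real objects come from the
# algorithm's own snapshotting, the values shown here are only placeholders.
prev_exp_state = dict(
    epoch=42,                          # last completed epoch; training resumes at epoch + 1
    exploration=None,                  # MdpPathCollector snapshot
    evaluation_remote=None,            # RemoteMdpPathCollector snapshot
    evaluation_remote_rng_state=None,  # RNG state of the remote evaluation worker
    replay_buffer=None,                # ReplayBuffer snapshot
    trainer=None,                      # SACTrainer snapshot
    global_pkg_rng_state=None,         # restored via set_global_pkg_rng_state
)

# experiment(variant, prev_exp_state=prev_exp_state)  # would resume at epoch 43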
Example #3
def experiment(variant, prev_exp_state=None):

    domain = variant['domain']
    seed = variant['seed']
    num_parallel = variant['num_parallel']
    custom_initialization = variant['custom_initialization']

    expl_env = parallel_gibson_env_producer(num_env=num_parallel)
    #expl_env = parallel_gibson_stadium_env_producer(num_env=num_parallel)

    #obs_dim = expl_env.observation_space.low.size
    observation_space = expl_env.observation_space
    action_dim = expl_env.action_space.low.size

    # Get producer functions for the policy and value networks
    q_producer = get_q_producer(observation_space, action_dim,
                                custom_initialization)
    policy_producer = get_policy_producer(observation_space, action_dim,
                                          custom_initialization)
    # Finished getting producers

    remote_eval_path_collector = RemoteMdpPathCollector.remote(
        domain, seed * 10 + 1, policy_producer, max_num_epoch_paths_saved=1)

    expl_path_collector = MdpPathCollector(
        expl_env,
        max_num_epoch_paths_saved=1,
    )
    replay_buffer = ReplayBuffer(variant['replay_buffer_size'],
                                 ob_space=expl_env.observation_space,
                                 action_space=expl_env.action_space)
    trainer = SACTrainer(policy_producer,
                         q_producer,
                         action_space=expl_env.action_space,
                         **variant['trainer_kwargs'])

    algorithm = BatchRLAlgorithm(
        trainer=trainer,
        exploration_data_collector=expl_path_collector,
        remote_eval_data_collector=remote_eval_path_collector,
        replay_buffer=replay_buffer,
        optimistic_exp_hp=variant['optimistic_exp'],
        **variant['algorithm_kwargs'])

    algorithm.to(ptu.device)

    if prev_exp_state is not None:

        expl_path_collector.restore_from_snapshot(
            prev_exp_state['exploration'])

        ray.get([
            remote_eval_path_collector.restore_from_snapshot.remote(
                prev_exp_state['evaluation_remote'])
        ])
        ray.get([
            remote_eval_path_collector.set_global_pkg_rng_state.remote(
                prev_exp_state['evaluation_remote_rng_state'])
        ])

        replay_buffer.restore_from_snapshot(prev_exp_state['replay_buffer'])

        trainer.restore_from_snapshot(prev_exp_state['trainer'])

        set_global_pkg_rng_state(prev_exp_state['global_pkg_rng_state'])

    start_epoch = (prev_exp_state['epoch'] + 1
                   if prev_exp_state is not None else 0)

    algorithm.train(start_epoch)
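
As with Example #1, a hedged sketch of the variant dictionary consumed by this parallel-Gibson variant; the keys mirror the code above, the values are assumptions.

# Hypothetical configuration; only the keys are taken from the code above.
variant = dict(
    domain='gibson',                 # assumed name, used only for the remote eval collector
    seed=0,
    num_parallel=4,                  # number of parallel Gibson environments
    custom_initialization=False,     # passed to the policy/Q producers
    replay_buffer_size=int(1e6),
    optimistic_exp=dict(),
    trainer_kwargs=dict(),
    algorithm_kwargs=dict(),
)

experiment(variant)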
Example #4
def experiment(variant, prev_exp_state=None):

    domain = variant['domain']
    seed = variant['seed']

    expl_env = env_producer(domain, seed)

    obs_dim = expl_env.observation_space.low.size
    action_dim = expl_env.action_space.low.size

    # The per-domain table below overrides the env-derived dimensions above
    obs_dim, action_dim = {
        'GridGoal1': (2, 2),
        'GridGoal2': (2, 2),
        'GridGoal3': (2, 2),
        'AntEscape': (29, 8),
        'AntJump': (29, 8),
        'AntNavigate': (29, 8),
        'HumanoidUp': (47, 17)
    }[domain]

    # Get producer functions for the policy and value networks
    M = variant['layer_size']

    q_producer = get_q_producer(obs_dim, action_dim, hidden_sizes=[M, M])
    policy_producer = get_policy_producer(obs_dim,
                                          action_dim,
                                          hidden_sizes=[M, M])
    # Finished getting producers

    remote_eval_path_collector = RemoteMdpPathCollector.remote(
        domain, seed * 10 + 1, policy_producer)

    expl_path_collector = MdpPathCollector(expl_env)
    replay_buffer = ReplayBuffer(variant['replay_buffer_size'],
                                 ob_dim=obs_dim,
                                 ac_dim=action_dim)

    trainer = SACTrainer(policy_producer,
                         q_producer,
                         action_space=expl_env.action_space,
                         **variant['trainer_kwargs'])

    algorithm = BatchRLAlgorithm(
        trainer=trainer,
        exploration_data_collector=expl_path_collector,
        remote_eval_data_collector=remote_eval_path_collector,
        replay_buffer=replay_buffer,
        optimistic_exp_hp=variant['optimistic_exp'],
        log_dir=variant['log_dir'],
        **variant['algorithm_kwargs'])

    algorithm.to(ptu.device)

    if prev_exp_state is not None:

        expl_path_collector.restore_from_snapshot(
            prev_exp_state['exploration'])

        ray.get([
            remote_eval_path_collector.restore_from_snapshot.remote(
                prev_exp_state['evaluation_remote'])
        ])
        ray.get([
            remote_eval_path_collector.set_global_pkg_rng_state.remote(
                prev_exp_state['evaluation_remote_rng_state'])
        ])

        replay_buffer.restore_from_snapshot(prev_exp_state['replay_buffer'])

        trainer.restore_from_snapshot(prev_exp_state['trainer'])

        set_global_pkg_rng_state(prev_exp_state['global_pkg_rng_state'])

    start_epoch = (prev_exp_state['epoch'] + 1
                   if prev_exp_state is not None else 0)

    algorithm.train(start_epoch)
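
A final sketch of a variant for this example; the domain must be one of the keys of the per-domain dimension table above, and the remaining values are placeholder assumptions.

# Hypothetical configuration; 'AntEscape' is one of the domains listed above.
variant = dict(
    domain='AntEscape',
    seed=0,
    layer_size=256,                  # M, hidden width for both Q and policy networks
    replay_buffer_size=int(1e6),
    optimistic_exp=dict(),
    trainer_kwargs=dict(),
    algorithm_kwargs=dict(),
    log_dir='./logs',                # assumed path; forwarded to BatchRLAlgorithm
)

experiment(variant)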