Example #1
from functools import lru_cache
from time import time
from typing import Tuple

import nevergrad as ng

# Import paths below assume the CompilerGym source tree and its
# llvm_autotuning example layout.
from compiler_gym.envs import CompilerEnv
from compiler_gym.util.gym_type_hints import ActionType
from llvm_autotuning.optimization_target import OptimizationTarget

def nevergrad(env: CompilerEnv,
              optimization_target: OptimizationTarget,
              search_time_seconds: int,
              seed: int,
              episode_length: int = 100,
              optimizer: str = "DiscreteLenglerOnePlusOne",
              **kwargs) -> None:
    """Optimize an environment using nevergrad.

    Nevergrad is a gradient-free optimization platform that provides
    implementations of various black-box optimization techniques:

        https://facebookresearch.github.io/nevergrad/
    """
    if optimization_target == OptimizationTarget.RUNTIME:
        # Runtime rewards are noisy, so recompute them on every evaluation
        # rather than caching.
        def calculate_negative_reward(actions: Tuple[ActionType, ...]) -> float:
            env.reset()
            env.multistep(actions)
            return -env.episode_reward

    else:
        # Only cache the deterministic non-runtime rewards.
        @lru_cache(maxsize=int(1e4))
        def calculate_negative_reward(actions: Tuple[ActionType, ...]) -> float:
            env.reset()
            env.multistep(actions)
            return -env.episode_reward

    # One categorical action choice per step of the fixed-length episode.
    params = ng.p.Choice(
        choices=range(env.action_space.n),
        repetitions=episode_length,
        deterministic=True,
    )
    params.random_state.seed(seed)

    optimizer_class = getattr(ng.optimizers, optimizer)
    # Use a distinct name to avoid shadowing the `optimizer` string argument.
    # `budget` is nominal: the search is bounded by wall time in the ask/tell
    # loop below, not by an evaluation count.
    opt = optimizer_class(parametrization=params, budget=1, num_workers=1)

    end_time = time() + search_time_seconds
    while time() < end_time:
        x = opt.ask()
        opt.tell(x, calculate_negative_reward(x.value))

    # Get the best solution and replay it.
    recommendation = opt.provide_recommendation()
    env.reset()
    env.multistep(recommendation.value)
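
A minimal usage sketch for the function above, assuming a CompilerGym LLVM
environment; the benchmark, reward space, and the OptimizationTarget.CODESIZE
member are illustrative assumptions:

import compiler_gym

with compiler_gym.make("llvm-v0", benchmark="cbench-v1/crc32") as env:
    # episode_reward requires a reward space to be set on the environment.
    env.reward_space = "IrInstructionCountOz"
    nevergrad(
        env,
        optimization_target=OptimizationTarget.CODESIZE,  # assumed member
        search_time_seconds=60,
        seed=0xCC,
    )
    print(env.episode_reward)  # reward of the replayed best action sequence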
Example #2
import random
from typing import Optional

from compiler_gym.envs import CompilerEnv

def run_one_trial(env: CompilerEnv, reward_space: str, min_steps: int,
                  max_steps: int) -> Optional[float]:
    """Run a random number of random steps in an environment and return the
    cumulative reward.

    :return: The cumulative reward, or None if the episode ended early.
    """
    num_steps = random.randint(min_steps, max_steps)
    warmup_actions = [env.action_space.sample() for _ in range(num_steps)]
    env.reward_space = reward_space
    _, _, done, _ = env.multistep(warmup_actions)
    if done:
        return None
    return env.episode_reward
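
A sketch of how this helper might be driven to estimate the spread of
cumulative rewards over repeated random episodes; the environment, benchmark,
reward space, and trial count are illustrative:

import compiler_gym

with compiler_gym.make("llvm-v0", benchmark="cbench-v1/qsort") as env:
    rewards = []
    for _ in range(25):
        env.reset()  # run_one_trial() does not reset the environment itself
        reward = run_one_trial(env, "IrInstructionCountOz",
                               min_steps=10, max_steps=100)
        if reward is not None:
            rewards.append(reward)
    mean = sum(rewards) / max(len(rewards), 1)
    print(f"{len(rewards)} completed trials, mean cumulative reward {mean:.3f}")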

# Variant: instead of the cumulative reward of a random episode, measure the
# immediate reward of a single given action after a random warmup prefix.
def run_one_trial(
    env: CompilerEnv, reward_space: str, action: int, max_warmup_steps: int
) -> Optional[float]:
    """Run a random number of "warmup" steps in an environment, then compute
    the immediate reward of the given action.

    :return: The immediate reward, or None if the episode ended during warmup
        or on the action itself.
    """
    num_warmup_steps = random.randint(0, max_warmup_steps)
    warmup_actions = [env.action_space.sample() for _ in range(num_warmup_steps)]
    env.reward_space = reward_space
    _, _, done, _ = env.multistep(warmup_actions)
    if done:
        return None
    _, (reward,), done, _ = env.step(action, reward_spaces=[reward_space])
    return None if done else reward
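
A matching sketch for this variant, estimating the immediate-reward
distribution of a single flag; the flag name, benchmark, and warmup bound are
illustrative, and env.action_space.flags assumes the LLVM action space:

import compiler_gym

with compiler_gym.make("llvm-v0", benchmark="cbench-v1/dijkstra") as env:
    action = env.action_space.flags.index("-mem2reg")
    rewards = []
    for _ in range(25):
        env.reset()
        reward = run_one_trial(env, "IrInstructionCountOz", action,
                               max_warmup_steps=25)
        if reward is not None:
            rewards.append(reward)
    print(f"{len(rewards)} valid trials for -mem2reg, rewards: {rewards}")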