def nevergrad(
    env: CompilerEnv,
    optimization_target: OptimizationTarget,
    search_time_seconds: int,
    seed: int,
    episode_length: int = 100,
    optimizer: str = "DiscreteLenglerOnePlusOne",
    **kwargs,
) -> None:
    """Optimize an environment using nevergrad.

    Nevergrad is a gradient-free optimization platform that provides
    implementations of various black box optimizations techniques:

        https://facebookresearch.github.io/nevergrad/

    :param env: The environment to search over. Left in the state produced
        by replaying the best found action sequence.
    :param optimization_target: The optimization target. Runtime rewards are
        noisy, so they are never cached; other targets are cached.
    :param search_time_seconds: Wall-clock budget for the ask/tell loop.
    :param seed: Seed for the parametrization's random state.
    :param episode_length: Number of actions in each candidate episode.
    :param optimizer: Name of a nevergrad optimizer class to instantiate.
    """

    def calculate_negative_reward(actions: Tuple[ActionType]) -> float:
        # Replay the candidate action sequence from a fresh state.
        # Nevergrad minimizes, so negate the cumulative episode reward.
        env.reset()
        env.multistep(actions)
        return -env.episode_reward

    if optimization_target != OptimizationTarget.RUNTIME:
        # Only cache the deterministic non-runtime rewards.
        calculate_negative_reward = lru_cache(maxsize=int(1e4))(
            calculate_negative_reward
        )

    params = ng.p.Choice(
        choices=range(env.action_space.n),
        repetitions=episode_length,
        deterministic=True,
    )
    params.random_state.seed(seed)
    optimizer_class = getattr(ng.optimizers, optimizer)
    # Use a distinct local for the instance so the `optimizer` string
    # argument is not shadowed.
    opt = optimizer_class(parametrization=params, budget=1, num_workers=1)

    end_time = time() + search_time_seconds
    while time() < end_time:
        candidate = opt.ask()
        opt.tell(candidate, calculate_negative_reward(candidate.value))

    # Get best solution and replay it.
    recommendation = opt.provide_recommendation()
    env.reset()
    env.multistep(recommendation.value)
def run_one_trial(
    env: CompilerEnv, reward_space: str, min_steps: int, max_steps: int
) -> Optional[float]:
    """Run a random number of random steps in an environment and return the
    cumulative reward.

    :return: A cumulative reward, or None if the episode terminated early.
    """
    step_count = random.randint(min_steps, max_steps)
    actions = [env.action_space.sample() for _ in range(step_count)]
    env.reward_space = reward_space
    _, _, done, _ = env.multistep(actions)
    return None if done else env.episode_reward
def run_one_trial(
    env: CompilerEnv, reward_space: str, action: int, max_warmup_steps: int
) -> Optional[float]:
    """Run a random number of "warmup" steps in an environment, then compute
    the immediate reward of the given action.

    :return: An immediate reward, or None if the episode terminated early.
    """
    warmup_count = random.randint(0, max_warmup_steps)
    env.reward_space = reward_space
    warmup = [env.action_space.sample() for _ in range(warmup_count)]
    _, _, done, _ = env.multistep(warmup)
    if done:
        return None
    _, (reward,), done, _ = env.step(action, reward_spaces=[reward_space])
    return None if done else reward