def select_best_action(env: CompilerEnv, executor: ThreadPoolExecutor) -> RewardAction:
    """Try every action on its own fork of ``env`` and return the top-ranked one.

    One fork of ``env`` is created per action index in
    ``range(env.action_space.n)``; each fork is stepped once on ``executor``
    and then closed. Results are compared using ``RewardAction``'s own
    ordering, which is intended to rank by reward first and by action index
    as a tie-breaker (that ordering is defined outside this function).

    Args:
        env: The environment to fork. It is not stepped by this function.
        executor: The thread pool on which the forks are evaluated.

    Returns:
        The greatest ``RewardAction`` observed, or
        ``RewardAction(reward=-inf, action=0)`` if nothing compares greater
        than that starting value.
    """

    def eval_action(fkd: CompilerEnv, action: int) -> RewardAction:
        """Step the fork once with ``action``; the fork is closed regardless."""
        try:
            _, reward, _, _ = fkd.step(action)
        finally:
            fkd.close()
        return RewardAction(reward=reward, action=action)

    num_actions = env.action_space.n
    best = RewardAction(reward=-float("inf"), action=0)

    # fork() is deliberately called here, on the calling thread, rather than
    # inside the worker: this keeps forking thread-safe in case the
    # environment restarts. Only the step() + close() pair runs in parallel.
    pending = [
        executor.submit(eval_action, env.fork(), action)
        for action in range(num_actions)
    ]

    for finished in as_completed(pending):
        # max() keeps the current best unless the new result is strictly
        # greater, so whichever result was seen first wins exact ties.
        best = max(best, finished.result())
    return best
def test_fork(env: CompilerEnv):
    """Check that a fork reports the parent's benchmark and action history."""
    env.reset()
    for action in (0, 1):
        env.step(action)

    forked = env.fork()
    try:
        assert env.benchmark == forked.benchmark
        assert forked.actions == [0, 1]
    finally:
        # Close the fork even when an assertion above fails.
        forked.close()
def hill_climb(env: CompilerEnv):
    """Randomly perturb ``env.choices`` and keep any perturbation that costs less.

    For each of ``FLAGS.gcc_search_budget`` rounds, a fork of ``env`` is given
    a new choice list in which every entry ``x`` is redrawn uniformly from
    ``[max(-1, x - 5), min(len(options) - 1, x + 5)]``, where ``options`` is
    the matching entry of ``env.gcc_spec.options``. If ``objective`` of the
    fork is lower than ``objective(env)``, the fork's choices are copied onto
    ``env``.

    Args:
        env: The environment to optimize; its ``choices`` are updated in place.

    Returns:
        The cost of the most recently accepted candidate, or ``float("inf")``
        if no candidate was accepted.
    """

    def perturb(index, value):
        """Redraw one choice within +/-5 of ``value``, clamped to its range."""
        lower = max(-1, value - 5)
        upper = min(len(env.gcc_spec.options[index]) - 1, value + 5)
        return random.randint(lower, upper)

    best = float("inf")
    for _ in range(FLAGS.gcc_search_budget):
        with env.fork() as fkd:
            candidate = []
            for index, value in enumerate(env.choices):
                candidate.append(perturb(index, value))
            fkd.choices = candidate

            cost = objective(fkd)
            # The baseline is re-evaluated every round, against env's current
            # choices.
            if not cost < objective(env):
                continue
            best = cost
            env.choices = fkd.choices
    return best
def test_fork(benchmark, env: CompilerEnv, benchmark_name):
    """Benchmark creating a fork of ``env`` and immediately closing it."""
    env.reset(benchmark_name)

    def fork_then_close():
        """Create one fork and close it straight away."""
        return env.fork().close()

    benchmark(fork_then_close)