def try_random(cls, trial_count):
    def random_fn(board):
        choices = [cls.UP, cls.RIGHT, cls.DOWN, cls.LEFT]
        return random.choice(choices)

    random_fn.info = "Random strategy"
    do_trials(cls, trial_count, random_fn, always_print=True)


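# Assumed contract for every strategy in this file (inferred from the calls
# above and below, not stated explicitly here): a strategy function takes the
# current board and returns one of cls.UP / cls.RIGHT / cls.DOWN / cls.LEFT,
# carries a human-readable .info label, and is handed to do_trials(), which is
# assumed to play trial_count episodes with it and report the resulting scores.

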
def try_greedy(cls, trial_count):
    # Greedy, break ties randomly
    def greedy_fn(board):
        actions = cls.get_valid_actions_by_reward_from_board(board)
        assert len(actions) > 0, "No actions available"
        return actions[0][0]

    greedy_fn.info = "Greedy strategy"
    do_trials(cls, trial_count, greedy_fn)


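# Assumed shape of cls.get_valid_actions_by_reward_from_board(board), inferred
# from its use here and in try_down_left_greedy / try_greedy_fixed_order below:
# a list of (action, reward, next_board) tuples sorted by descending reward,
# e.g. [(cls.LEFT, 24, <board after LEFT>), (cls.DOWN, 4, <board after DOWN>)],
# so actions[0][0] is simply the highest-immediate-reward move.

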
def try_only_go_right(cls, trial_count):
    def right_fn(board):
        return cls.RIGHT

    def right_done(prev, curr, reward, done):
        return done or prev == curr

    right_fn.info = "Strategy only moves right"
    do_trials(cls, trial_count, right_fn, right_done)


def try_down_left_greedy(cls, trial_count):
    # Pick the best of {down, left} if available, otherwise the best of {up, right}
    def down_left_greedy_fn(board):
        actions = cls.get_valid_actions_by_reward_from_board(board)
        assert len(actions) > 0, "No actions available"
        for action, reward, next_board in actions:
            if action in [cls.DOWN, cls.LEFT]:
                return action
        return actions[0][0]

    down_left_greedy_fn.info = "Down left greedy strategy"
    do_trials(cls, trial_count, down_left_greedy_fn)


def try_fixed_action_order(cls, trial_count):
    # Always choose actions in a particular order if they are valid
    def fixed_fn(board):
        ACTION_ORDER = [cls.DOWN, cls.LEFT, cls.UP, cls.RIGHT]
        actions = cls.get_valid_actions_from_board(board)
        actions = [a for (a, r, b) in actions]
        assert len(actions) > 0, "No actions available"
        for action in ACTION_ORDER:
            if action in actions:
                return action
        assert False, "Could not find action"

    fixed_fn.info = "Fixed order strategy"
    do_trials(cls, trial_count, fixed_fn)


def try_down_left(cls, trial_count):
    def down_left_fn(board):
        action_rewards = cls.get_valid_actions_from_board(board)
        valid_actions = [a for (a, r, b) in action_rewards]
        down_left = [cls.DOWN, cls.LEFT]
        up_right = [cls.UP, cls.RIGHT]
        random.shuffle(down_left)
        random.shuffle(up_right)
        for action in down_left + up_right:
            if action in valid_actions:
                return action
        assert False, "should be able to do something"

    down_left_fn.info = "Down Left strategy"
    do_trials(cls, trial_count, down_left_fn)


def try_lookahead_with_rollout(cls, trial_count):
    lookahead_fn = get_lookahead_fn(cls, LOOKAHEAD_COUNT)

    def lookahead_with_rollout_fn(board):
        if len([v for v in board if v == 0]) > ROLLOUT_THRESHOLD:
            return lookahead_fn(board)
        else:
            return best_next_move_from_random_rollouts(cls, board)

    lookahead_with_rollout_fn.info = (
        f"Lookahead {LOOKAHEAD_COUNT} with "
        f"{ROLLOUTS_PER_MOVE} random rollouts per move "
        f"when board has <= {ROLLOUT_THRESHOLD} empty spaces")
    do_trials(cls, trial_count, lookahead_with_rollout_fn, always_print=True)


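# Illustrative sketch only: best_next_move_from_random_rollouts() is defined
# elsewhere in this repo. The hypothetical helper below shows the general idea
# it is assumed to implement -- score each valid first move by the average
# return of several random playouts that start with it. It uses the
# (action, reward, next_board) tuples from get_valid_actions_from_board() and
# ignores random tile spawns, which is a simplifying assumption.
def _example_rollout_move_chooser(cls, board, rollouts_per_move=10, max_depth=50):
    best_action, best_score = None, float("-inf")
    for action, reward, next_board in cls.get_valid_actions_from_board(board):
        total = 0.0
        for _ in range(rollouts_per_move):
            # Play one random continuation from the board reached by `action`.
            rollout_board, rollout_return = next_board, reward
            for _ in range(max_depth):
                rollout_actions = cls.get_valid_actions_from_board(rollout_board)
                if not rollout_actions:
                    break
                _a, r, rollout_board = random.choice(rollout_actions)
                rollout_return += r
            total += rollout_return
        mean_return = total / rollouts_per_move
        if mean_return > best_score:
            best_action, best_score = action, mean_return
    return best_action

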
def try_ray_ppo_planning(cls, trial_count):
    with mlflow.start_run():
        ray.init()
        config = {
            "env": cls,
            "num_workers": 3,
            "num_gpus": 0,
            "horizon": 5,
            "train_batch_size": 12000,  # val of 128 leads to ~1s per training iteration.
        }
        full_config = DEFAULT_CONFIG.copy()
        for k, v in config.items():
            full_config[k] = v
        pprint(full_config)
        agent = PPOTrainerWithReset(full_config)
        strategy_fn = get_strategy_function(cls, agent)
        strategy_fn.info = "Ray PPO Planning strategy"
        trial_result = do_trials(cls, trial_count, strategy_fn,
                                 max_steps_per_episode=10000,
                                 always_print=True)
        checkpoint = agent.save()
        print(f"checkpoint saved at {checkpoint}")
        mlflow.log_metrics(trial_result)


def try_mcts(cls, trial_count):
    from collections import defaultdict

    action_space = cls().action_space

    # n and sum_ret are assumed to be per-run tabular value stores keyed by
    # (state_tuple, action): n counts visits, sum_ret accumulates returns.
    # defaultdicts let q() read unvisited pairs as zero.
    n = defaultdict(int)
    sum_ret = defaultdict(float)

    def q(s, a, n, sum_ret):
        if n[(tuple(s), a)]:
            return sum_ret[(tuple(s), a)] / n[(tuple(s), a)]
        else:
            return 0

    def mcts_policy_fn(board):
        # return action_space[np.argmax([q(s, a) for a in range(action_space.n)])]
        action_values = np.array(
            [q(board, a, n, sum_ret) for a in range(action_space.n)])
        # break ties with random coin flip.
        # return action_space[np.random.choice(np.flatnonzero(action_values == action_values.max()))]
        action = np.argmax(np.random.multinomial(1, softmax(action_values)))
        return action

    def dont_repeat_done(prev, curr, reward, done):
        return done or prev == curr

    mcts_policy_fn.info = "MCTS strategy"

    with mlflow.start_run():
        epsilon = 1
        discount_rate = 0.9
        num_training_iters = 500
        rollouts_per_training_iter = 1000
        max_steps_per_episode = 20
        random_seed = 42
        perc_rollouts_full_random = 50
        mlflow.log_param("epsilon", epsilon)
        mlflow.log_param("discount_rate", discount_rate)
        mlflow.log_param("num_training_iter", num_training_iters)
        mlflow.log_param("rollouts_per_training_iter", rollouts_per_training_iter)
        for training_iter in range(num_training_iters):
            print("training MCTS policy for iteration %s" % training_iter)
            train_tabular_mcts(
                cls,
                mcts_policy_fn,
                n,
                sum_ret,
                num_rollouts=rollouts_per_training_iter,
                max_steps_per_episode=max_steps_per_episode,
                epsilon=epsilon,
                discount_rate=discount_rate,
                perc_rollouts_full_random=perc_rollouts_full_random,
                rollout_start_count=rollouts_per_training_iter * training_iter,
                random_seed=random_seed,
                print_stats=True,
            )
            print("testing MCTS performance with %s trials for iteration %s"
                  % (trial_count, training_iter))
            trial_result = do_trials(
                cls, trial_count, mcts_policy_fn,
                random_seed=random_seed,
                max_steps_per_episode=max_steps_per_episode)
            mlflow.log_metrics(trial_result, step=training_iter)


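# Illustrative sketch only: train_tabular_mcts() is defined elsewhere in this
# repo. The hypothetical single-rollout update below shows how the n / sum_ret
# tables consumed above are assumed to be filled: play one epsilon-greedy
# episode, then credit every visited (state, action) pair with the discounted
# return that followed it. It assumes cls is a Gym-style env whose reset()/step()
# return the board and (obs, reward, done, info), which matches its use as an
# RLlib "env" above but is not confirmed in this file.
def _example_tabular_mcts_rollout(cls, policy_fn, n, sum_ret,
                                  epsilon=0.1, discount_rate=0.9, max_steps=20):
    env = cls()
    state = env.reset()
    transitions = []  # (state_tuple, action, reward) in the order visited
    for _ in range(max_steps):
        if random.random() < epsilon:
            action = env.action_space.sample()
        else:
            action = policy_fn(state)
        next_state, reward, done, _info = env.step(action)
        transitions.append((tuple(state), action, reward))
        state = next_state
        if done:
            break
    # Backward pass: accumulate discounted returns into the tables.
    ret = 0.0
    for s, a, r in reversed(transitions):
        ret = r + discount_rate * ret
        n[(s, a)] += 1
        sum_ret[(s, a)] += ret

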
def try_greedy_fixed_order(cls, trial_count):
    ORDER = [cls.UP, cls.DOWN, cls.LEFT, cls.RIGHT]
    random.shuffle(ORDER)

    # Greedy, break ties in a fixed order
    def greedy_fixed_order_fn(board):
        actions = cls.get_valid_actions_by_reward_from_board(board)
        top_reward = actions[0][1]
        equiv_actions = [a for (a, r, b) in actions if r >= top_reward]
        assert len(equiv_actions) > 0, "No actions available"
        for action in ORDER:
            if action in equiv_actions:
                return action
        assert False

    greedy_fixed_order_fn.info = "Greedy strategy with fixed preference"
    do_trials(cls, trial_count, greedy_fixed_order_fn)


def _try_nick_q_learning(cls, trial_count):
    start = time.time()
    i = 0
    last_scores_to_store = 10
    all_scores = []
    game = cls()
    q_table = QTable(cls)
    while True:
        run_episode(game, q_table)
        all_scores.append(game.score)
        i += 1
        if i % 100 == 0:
            total_sec = round(time.time() - start, 2)
            sec_per_iter = round(total_sec / i, 2)
            max_game_score = round(max(all_scores), 0)
            mean_game_score = round(sum(all_scores) / len(all_scores), 0)
            last_score_idx = -1 * last_scores_to_store
            last_x_scores = all_scores[last_score_idx:]
            avg_last_x = round(sum(last_x_scores) / len(last_x_scores), 2)
            print(f"Training iteration {i} "
                  f"({total_sec} sec total, {sec_per_iter} sec per iter)"
                  f"\n\tLast {last_scores_to_store}: {last_x_scores} "
                  f"(avg: {avg_last_x})"
                  f"\n\tMax game score: {max_game_score}"
                  f"\n\tMean game score: {mean_game_score}"
                  f"\n\tSize of state value table: "
                  f"{round(q_table.size_in_mb, 2)}MB"
                  f"\n\tQ table hit rate: "
                  f"{round(q_table.hit_rate, 2)}% "
                  f"({q_table.lookup_hits} out of "
                  f"{q_table.lookup_count})"
                  f"\n\tQ table non-zero hit rate: "
                  f"{round(q_table.nonzero_hit_rate, 2)}% "
                  f"({q_table.lookup_nonzero_hits} out of "
                  f"{q_table.lookup_count})\n")
            all_scores = []
            q_table.reset_counters()
        if i % 1000 == 0:
            q_table.reset_counters()

            def q_learning_benchmark_fn(board):
                return q_table.get_max_action(board)

            q_learning_benchmark_fn.info = f"Q-learning iteration {i}"
            results = do_trials(cls, trial_count, q_learning_benchmark_fn)
            mlflow.log_metric("max tile", results["Max Tile"], step=i)
            mlflow.log_metric("max score", results["Max Score"], step=i)
            mlflow.log_metric("mean score", results["Mean Score"], step=i)
            mlflow.log_metric("median score", results["Median Score"], step=i)
            mlflow.log_metric("stdev", results["Standard Dev"], step=i)
            mlflow.log_metric("min score", results["Min Score"], step=i)
            mlflow.log_metric("q hit rate", q_table.hit_rate, step=i)
            mlflow.log_metric("q nonzero hit rate", q_table.nonzero_hit_rate, step=i)
            mlflow.log_metric("q size", q_table.size_in_mb, step=i)
            print(f"Q table hit rate: "
                  f"{round(q_table.hit_rate, 2)}% "
                  f"({q_table.lookup_hits} out of "
                  f"{q_table.lookup_count})\n"
                  f"Q table non-zero hit rate: "
                  f"{round(q_table.nonzero_hit_rate, 2)}% "
                  f"({q_table.lookup_nonzero_hits} out of "
                  f"{q_table.lookup_count})\n"
                  f"Size of state value table: "
                  f"{round(q_table.size_in_mb, 2)}MB\n\n"
                  f"=================\n\n")


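# Illustrative sketch only: QTable is defined elsewhere in this repo. The
# skeleton below just records the interface that _try_nick_q_learning() relies
# on; the method names and attributes are taken from that usage, the bodies are
# hypothetical.
class _QTableInterfaceSketch:
    def __init__(self, env_cls):
        ...

    def get_max_action(self, board):
        """Return the action with the highest learned Q-value for this board."""

    def reset_counters(self):
        """Zero out the lookup counters used for the hit-rate stats."""

    # Attributes read above: size_in_mb, hit_rate, nonzero_hit_rate,
    # lookup_hits, lookup_nonzero_hits, lookup_count.

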
def try_lookahead(cls, trial_count, lookahead_count):
    lookahead_fn = get_lookahead_fn(cls, lookahead_count)
    lookahead_fn.info = f"Lookahead {lookahead_count} strategy"
    do_trials(cls, trial_count, lookahead_fn)
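

# Illustrative sketch only: get_lookahead_fn() is defined elsewhere in this
# repo. A hypothetical version is shown here to make the idea concrete --
# exhaustively search move sequences up to `depth` plies using the
# (action, reward, next_board) transitions from get_valid_actions_from_board(),
# then play the first move of the best sequence. Random tile spawns are
# ignored, which is a simplifying assumption, not necessarily what the real
# helper does.
def _example_get_lookahead_fn(cls, depth):
    def best_return(board, remaining):
        # Best total reward reachable from `board` within `remaining` moves.
        if remaining == 0:
            return 0
        actions = cls.get_valid_actions_from_board(board)
        if not actions:
            return 0
        return max(r + best_return(b, remaining - 1) for (a, r, b) in actions)

    def lookahead_fn(board):
        actions = cls.get_valid_actions_from_board(board)
        assert len(actions) > 0, "No actions available"
        scored = [(r + best_return(b, depth - 1), a) for (a, r, b) in actions]
        return max(scored, key=lambda t: t[0])[1]

    return lookahead_fn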