Example #1
def try_random(cls, trial_count):
    def random_fn(board):
        choices = [cls.UP, cls.RIGHT, cls.DOWN, cls.LEFT]
        return random.choice(choices)

    random_fn.info = "Random strategy"
    do_trials(cls, trial_count, random_fn, always_print=True)
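
Every example on this page hands a strategy function to do_trials, which is defined elsewhere in the project. Below is a minimal sketch of what such a helper might look like, assuming a gym-style environment (reset(), step() returning (board, reward, done, info), and a .score attribute) and the result keys that Example #11 reads back; the real helper may differ.

import random
import statistics


def do_trials(cls, trial_count, strategy_fn, done_fn=None,
              max_steps_per_episode=10000, random_seed=None,
              always_print=False):
    # Hypothetical sketch: run `trial_count` episodes of the env `cls` using
    # `strategy_fn(board) -> action` and return summary statistics.
    if random_seed is not None:
        random.seed(random_seed)
    scores, max_tiles = [], []
    for _ in range(trial_count):
        env = cls()
        board = env.reset()
        for _ in range(max_steps_per_episode):
            action = strategy_fn(board)
            next_board, reward, done, _info = env.step(action)
            # An optional done_fn can end the episode early, e.g. when the
            # board stops changing (see Example #3).
            if done_fn is not None:
                done = done_fn(board, next_board, reward, done)
            board = next_board
            if done:
                break
        scores.append(env.score)
        max_tiles.append(max(board))
    results = {
        "Max Tile": max(max_tiles),
        "Max Score": max(scores),
        "Mean Score": statistics.mean(scores),
        "Median Score": statistics.median(scores),
        "Standard Dev": statistics.pstdev(scores),
        "Min Score": min(scores),
    }
    if always_print:
        print(f"{getattr(strategy_fn, 'info', 'Strategy')}: {results}")
    return results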
Example #2
def try_greedy(cls, trial_count):
    # Greedy, break ties randomly
    def greedy_fn(board):
        actions = cls.get_valid_actions_by_reward_from_board(board)
        assert len(actions) > 0, "No actions available"
        return actions[0][0]

    greedy_fn.info = "Greedy strategy"
    do_trials(cls, trial_count, greedy_fn)
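
get_valid_actions_by_reward_from_board (and the unsorted get_valid_actions_from_board used in later examples) is assumed to return (action, reward, next_board) triples, sorted best-first by reward in the by_reward variant, which is why actions[0][0] is the greedy move. A tiny hypothetical illustration of that shape:

# Hypothetical data only; the real helper computes these from the board.
UP, RIGHT, DOWN, LEFT = 0, 1, 2, 3
actions = [(DOWN, 8, "board_after_down"),
           (LEFT, 4, "board_after_left"),
           (RIGHT, 0, "board_after_right")]  # sorted by reward, best first
assert actions[0][0] == DOWN  # what greedy_fn returns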
Example #3
def try_only_go_right(cls, trial_count):
    def right_fn(board):
        return cls.RIGHT

    def right_done(prev, curr, reward, done):
        # Also end the episode when the board stops changing, i.e. moving
        # right is no longer a valid move.
        return done or prev == curr

    right_fn.info = "Strategy only moves right"
    do_trials(cls, trial_count, right_fn, right_done)
Example #4
def try_down_left_greedy(cls, trial_count):
    # pick best of {down, left} if available, otherwise best of {up, right}
    def down_left_greedy_fn(board):
        actions = cls.get_valid_actions_by_reward_from_board(board)
        assert len(actions) > 0, "No actions available"
        for action, _reward, _next_board in actions:
            if action in [cls.DOWN, cls.LEFT]:
                return action
        return actions[0][0]

    down_left_greedy_fn.info = "Down left greedy strategy"
    do_trials(cls, trial_count, down_left_greedy_fn)
Example #5
def try_fixed_action_order(cls, trial_count):
    # Always choose actions in a particular order if they are valid
    def fixed_fn(board):
        ACTION_ORDER = [cls.DOWN, cls.LEFT, cls.UP, cls.RIGHT]
        actions = cls.get_valid_actions_from_board(board)
        actions = [a for (a, r, b) in actions]
        assert len(actions) > 0, "No actions available"
        for action in ACTION_ORDER:
            if action in actions:
                return action
        assert False, "Could not find action"

    fixed_fn.info = "Fixed order strategy"
    do_trials(cls, trial_count, fixed_fn)
Example #6
def try_down_left(cls, trial_count):
    # Prefer DOWN/LEFT (in random order); fall back to UP/RIGHT only when forced
    def down_left_fn(board):
        action_rewards = cls.get_valid_actions_from_board(board)
        valid_actions = [a for (a, r, b) in action_rewards]
        down_left = [cls.DOWN, cls.LEFT]
        up_right = [cls.UP, cls.RIGHT]
        random.shuffle(down_left)
        random.shuffle(up_right)
        for action in down_left + up_right:
            if action in valid_actions:
                return action
        assert False, "should be able to do something"

    down_left_fn.info = "Down Left strategy"
    do_trials(cls, trial_count, down_left_fn)
Example #7
def try_lookahead_with_rollout(cls, trial_count):
    lookahead_fn = get_lookahead_fn(cls, LOOKAHEAD_COUNT)

    def lookahead_with_rollout_fn(board):
        # Plenty of empty squares: plain lookahead is enough.  On a crowded
        # board, switch to random rollouts.
        if len([v for v in board if v == 0]) > ROLLOUT_THRESHOLD:
            return lookahead_fn(board)
        else:
            return best_next_move_from_random_rollouts(cls, board)

    lookahead_with_rollout_fn.info = f"Lookahead {LOOKAHEAD_COUNT} with "
    lookahead_with_rollout_fn.info += f"{ROLLOUTS_PER_MOVE} "
    lookahead_with_rollout_fn.info += f"random rollouts per move "
    lookahead_with_rollout_fn.info += f"when board has <= {ROLLOUT_THRESHOLD} "
    lookahead_with_rollout_fn.info += f"empty spaces"
    do_trials(cls, trial_count, lookahead_with_rollout_fn, always_print=True)
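
best_next_move_from_random_rollouts is not shown on this page. A sketch of the idea it names, assuming each valid first move is scored by the average reward collected over a number of random continuations; the rollout count and depth defaults below are hypothetical stand-ins for module constants (ROLLOUTS_PER_MOVE, etc.) that are not visible here.

import random


def best_next_move_from_random_rollouts(cls, board, rollouts_per_move=100,
                                        rollout_depth=20):
    # Sketch: score each first move by the average reward accumulated over
    # `rollouts_per_move` random continuations of up to `rollout_depth` moves.
    best_action, best_value = None, float("-inf")
    for action, reward, next_board in cls.get_valid_actions_from_board(board):
        total = 0.0
        for _ in range(rollouts_per_move):
            rollout_board, rollout_return = next_board, reward
            for _ in range(rollout_depth):
                choices = cls.get_valid_actions_from_board(rollout_board)
                if not choices:
                    break  # dead board, stop this rollout
                _a, r, rollout_board = random.choice(choices)
                rollout_return += r
            total += rollout_return
        if total / rollouts_per_move > best_value:
            best_value = total / rollouts_per_move
            best_action = action
    return best_action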
Example #8
def try_ray_ppo_planning(cls, trial_count):
    with mlflow.start_run():
        ray.init()
        config = {
            "env": cls,
            "num_workers": 3,
            "num_gpus": 0,
            "horizon": 5,
            "train_batch_size":
            12000,  # val of 128 leads to ~1s per training iteration.
        }
        full_config = DEFAULT_CONFIG.copy()
        full_config.update(config)
        pprint(full_config)
        agent = PPOTrainerWithReset(full_config)
        strategy_fn = get_strategy_function(cls, agent)
        strategy_fn.info = "Ray PPO Planning strategy"

        trial_result = do_trials(cls,
                                 trial_count,
                                 strategy_fn,
                                 max_steps_per_episode=10000,
                                 always_print=True)
        checkpoint = agent.save()
        print(f"checkpoint saved at {checkpoint}")
        mlflow.log_metrics(trial_result)
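
get_strategy_function and PPOTrainerWithReset are defined elsewhere. A plausible sketch of the former, assuming it simply adapts the trained RLlib agent to the board -> action interface that do_trials expects (compute_action is the classic single-observation RLlib inference call); the real helper may differ.

def get_strategy_function(cls, agent):
    # Sketch: wrap a trained RLlib agent as a strategy function.
    def strategy_fn(board):
        return agent.compute_action(board)
    return strategy_fn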
Example #9
def try_mcts(cls, trial_count):
    action_space = cls().action_space

    # Per-(state, action) visit counts and return sums.  These appear to be
    # module-level tables in the original source; they are defined here so
    # the example is self-contained.
    from collections import defaultdict
    n = defaultdict(int)
    sum_ret = defaultdict(float)

    def q(s, a, n, sum_ret):
        if n[(tuple(s), a)]:
            return sum_ret[(tuple(s), a)] / n[(tuple(s), a)]
        else:
            return 0

    def mcts_policy_fn(board):
        # return action_space[np.argmax([q(s, a) for a in range(action_space.n)])]
        action_values = np.array(
            [q(board, a, n, sum_ret) for a in range(action_space.n)])
        # Sample an action from a softmax over the Q-values (softmax is
        # assumed to be scipy.special.softmax); the commented-out line below
        # would instead break ties uniformly at random.
        # return action_space[np.random.choice(np.flatnonzero(action_values == action_values.max()))]
        action = np.argmax(np.random.multinomial(1, softmax(action_values)))
        return action

    def dont_repeat_done(prev, curr, reward, done):
        return done or prev == curr

    mcts_policy_fn.info = "MCTS strategy"
    with mlflow.start_run():
        epsilon = 1
        discount_rate = 0.9
        num_training_iters = 500
        rollouts_per_training_iter = 1000
        max_steps_per_episode = 20
        random_seed = 42
        perc_rollouts_full_random = 50
        mlflow.log_param("epsilon", epsilon)
        mlflow.log_param("discount_rate", discount_rate)
        mlflow.log_param("num_training_iter", num_training_iters)
        mlflow.log_param("rollouts_per_training_iter",
                         rollouts_per_training_iter)
        for training_iter in range(num_training_iters):
            print("training MCTS policy for iteration %s" % training_iter)
            train_tabular_mcts(
                cls,
                mcts_policy_fn,
                n,
                sum_ret,
                num_rollouts=rollouts_per_training_iter,
                max_steps_per_episode=max_steps_per_episode,
                epsilon=epsilon,
                discount_rate=discount_rate,
                perc_rollouts_full_random=perc_rollouts_full_random,
                rollout_start_count=rollouts_per_training_iter * training_iter,
                random_seed=random_seed,
                print_stats=True,
            )
            print("testing MCTS performance with %s trials for iteration %s" %
                  (trial_count, training_iter))
            trial_result = do_trials(
                cls,
                trial_count,
                mcts_policy_fn,
                random_seed=random_seed,
                max_steps_per_episode=max_steps_per_episode)
            mlflow.log_metrics(trial_result, step=training_iter)
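
train_tabular_mcts is also defined elsewhere. The core Monte-Carlo update it presumably performs on the n / sum_ret tables that q() reads might look like the sketch below: walk a finished rollout backwards, accumulate the discounted return, and update the per-(state, action) counts and return sums. This is an illustration of the technique, not the project's actual code.

from collections import defaultdict


def update_tables_from_rollout(episode, n, sum_ret, discount_rate=0.9):
    # `episode` is a list of (board, action, reward) tuples from one rollout.
    G = 0.0
    for board, action, reward in reversed(episode):
        G = reward + discount_rate * G  # discounted return from here onward
        key = (tuple(board), action)
        n[key] += 1                     # visit count
        sum_ret[key] += G               # sum of returns, so q = sum_ret / n


# Usage with tables shaped like the ones in try_mcts:
n, sum_ret = defaultdict(int), defaultdict(float)
update_tables_from_rollout([([0, 2, 0, 2], 3, 4.0)], n, sum_ret)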
Example #10
def try_greedy_fixed_order(cls, trial_count):
    # A fixed tie-breaking preference order, chosen at random once per run
    ORDER = [cls.UP, cls.DOWN, cls.LEFT, cls.RIGHT]
    random.shuffle(ORDER)

    # Greedy, break ties using the fixed preference order
    def greedy_fixed_order_fn(board):
        actions = cls.get_valid_actions_by_reward_from_board(board)
        assert len(actions) > 0, "No actions available"
        top_reward = actions[0][1]
        top_actions = [a for (a, r, b) in actions if r >= top_reward]
        for action in ORDER:
            if action in top_actions:
                return action
        assert False, "Could not find action"

    greedy_fixed_order_fn.info = "Greedy strategy with fixed preference"
    do_trials(cls, trial_count, greedy_fixed_order_fn)
Example #11
def _try_nick_q_learning(cls, trial_count):
    start = time.time()
    i = 0
    last_scores_to_store = 10
    all_scores = []
    game = cls()
    q_table = QTable(cls)
    while True:
        run_episode(game, q_table)
        all_scores.append(game.score)
        i += 1
        if i % 100 == 0:
            total_sec = round(time.time() - start, 2)
            sec_per_iter = round(total_sec / i, 2)
            max_game_score = round(max(all_scores), 0)
            mean_game_score = round(sum(all_scores) / len(all_scores), 0)
            last_score_idx = -1 * last_scores_to_store
            last_x_scores = all_scores[last_score_idx:]
            avg_last_x = round(sum(last_x_scores) / len(last_x_scores), 2)
            print(f"Training iteration {i} "
                  f"({total_sec} sec total, {sec_per_iter} sec per iter)"
                  f"\n\tLast {last_scores_to_store}: {last_x_scores} "
                  f"(avg: {avg_last_x})"
                  f"\n\tMax game score: {max_game_score}"
                  f"\n\tMean game score: {mean_game_score}"
                  f"\n\tSize of state value table: "
                  f"{round(q_table.size_in_mb, 2)}MB"
                  f"\n\tQ table hit rate: "
                  f"{round(q_table.hit_rate, 2)}% "
                  f"({q_table.lookup_hits} out of "
                  f"{q_table.lookup_count})"
                  f"\n\tQ table non-zero hit rate: "
                  f"{round(q_table.nonzero_hit_rate, 2)}% "
                  f"({q_table.lookup_nonzero_hits} out of "
                  f"{q_table.lookup_count})\n")
            all_scores = []
            q_table.reset_counters()
        if i % 1000 == 0:
            q_table.reset_counters()

            def q_learning_benchmark_fn(board):
                return q_table.get_max_action(board)

            q_learning_benchmark_fn.info = f"Q-learning iteration {i}"
            results = do_trials(cls, trial_count, q_learning_benchmark_fn)
            mlflow.log_metric("max tile", results["Max Tile"], step=i)
            mlflow.log_metric("max score", results["Max Score"], step=i)
            mlflow.log_metric("mean score", results["Mean Score"], step=i)
            mlflow.log_metric("median score", results["Median Score"], step=i)
            mlflow.log_metric("stdev", results["Standard Dev"], step=i)
            mlflow.log_metric("min score", results["Min Score"], step=i)
            mlflow.log_metric("q hit rate", q_table.hit_rate, step=i)
            mlflow.log_metric("q nonzero hit rate",
                              q_table.nonzero_hit_rate,
                              step=i)
            mlflow.log_metric("q size", q_table.size_in_mb, step=1)

            print(f"Q table hit rate: "
                  f"{round(q_table.hit_rate, 2)}% "
                  f"({q_table.lookup_hits} out of "
                  f"{q_table.lookup_count})\n"
                  f"Q table non-zero hit rate: "
                  f"{round(q_table.nonzero_hit_rate, 2)}% "
                  f"({q_table.lookup_nonzero_hits} out of "
                  f"{q_table.lookup_count})\n"
                  f"Size of state value table: "
                  f"{round(q_table.size_in_mb, 2)}MB\n\n"
                  f"=================\n\n")
Example #12
def try_lookahead(cls, trial_count, lookahead_count):
    lookahead_fn = get_lookahead_fn(cls, lookahead_count)
    lookahead_fn.info = f"Lookahead {lookahead_count} strategy"
    do_trials(cls, trial_count, lookahead_fn)
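
get_lookahead_fn (used here and in Example #7) is defined elsewhere. The sketch below shows one way such a depth-limited lookahead could work on the (action, reward, next_board) triples used throughout this page; the real helper may handle random tile spawns or pruning differently.

def get_lookahead_fn(cls, lookahead_count):
    # Sketch: choose the move whose best continuation over the next
    # `lookahead_count` moves yields the largest total reward.
    def best_future_reward(board, depth):
        if depth == 0:
            return 0
        actions = cls.get_valid_actions_by_reward_from_board(board)
        if not actions:
            return 0
        return max(r + best_future_reward(b, depth - 1) for (_a, r, b) in actions)

    def lookahead_fn(board):
        actions = cls.get_valid_actions_by_reward_from_board(board)
        assert len(actions) > 0, "No actions available"
        return max(actions,
                   key=lambda arb: arb[1] + best_future_reward(arb[2], lookahead_count - 1))[0]

    return lookahead_fn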