def from_values(cls, values: dict):
    """Build a deterministic policy that is greedy with respect to *values*.

    Every state with ``current_sum < 12`` maps to ``Action.HIT``
    unconditionally; for all other states the action with the strictly
    higher action-value is chosen, with ties resolved to ``Action.HIT``.

    :param values: mapping from ``StateActionPair`` to an action value.
    :return: a ``Policy`` built via ``Policy.from_deterministic_mapping``.
    """
    def _greedy_action(state):
        # Forced HIT region — no value comparison is performed here.
        if state.current_sum < 12:
            return Action.HIT
        stick_value = values[StateActionPair(state, Action.STICK)]
        hit_value = values[StateActionPair(state, Action.HIT)]
        # Strict '>' means a tie falls through to HIT, as in the original.
        return Action.STICK if stick_value > hit_value else Action.HIT

    mapping = {state: _greedy_action(state) for state in State.get_all_states()}
    return Policy.from_deterministic_mapping(mapping)
def epsilon_greedy_from_values(cls, values: dict, exploring_prob: Callable):
    """Build an epsilon-greedy probabilistic policy from action values.

    For each state the greedy action (the one with the strictly higher
    Q-value; HIT on ties) receives probability ``1 - eps`` and the other
    action receives ``eps``, where ``eps = exploring_prob()``. The pair is
    ordered ``[P(STICK), P(HIT)]``, matching the original mapping layout.

    Fix: ``exploring_prob()`` is now called exactly once per state instead
    of twice per pair, so the two probabilities always sum to 1 even when
    the callable is stateful (e.g. a decaying epsilon schedule).

    :param values: mapping from ``StateActionPair`` to an action value.
    :param exploring_prob: zero-argument callable returning the current
        exploration probability epsilon.
    :return: a ``Policy`` built via ``Policy.from_probabilistic_mapping``.
    """
    mapping = dict()
    for s in State.get_all_states():
        # Single call keeps the probability pair internally consistent.
        eps = exploring_prob()
        stick_is_greedy = (
            values[StateActionPair(s, Action.STICK)]
            > values[StateActionPair(s, Action.HIT)]
        )
        if stick_is_greedy:
            mapping[s] = [1. - eps, eps]
        else:
            mapping[s] = [eps, 1. - eps]
    return Policy.from_probabilistic_mapping(mapping)
from itertools import product

from model.actions import Action
from model.policy import Policy
from model.state import State, StateActionPair

# Precomputed enumerations shared by all algorithms: every state and every
# (state, action) combination.
ALL_STATES = State.get_all_states()
ALL_STATE_ACTION_PAIRS = [
    StateActionPair(s, a) for s, a in product(ALL_STATES, list(Action))
]


class Algorithm:
    """Base class for value-based learning algorithms.

    Holds the action-value table ``self._Q`` (one entry per
    state-action pair, initialised to 0.0) and defines the interface
    (``policy``, ``train``) that concrete algorithms must implement.
    """

    @classmethod
    def _create_sap_unif_mapping(cls, value):
        """Return a dict mapping every state-action pair to *value*."""
        return {sap: value for sap in ALL_STATE_ACTION_PAIRS}

    @property
    def policy(self) -> Policy:
        """Policy derived from the current estimates; subclasses override."""
        # Fix: 'raise NotImplemented' raised the NotImplemented singleton,
        # which is not an exception (it triggers a confusing TypeError).
        raise NotImplementedError

    def __init__(self):
        # Action-value estimates Q(s, a), uniformly initialised to 0.0.
        self._Q = Algorithm._create_sap_unif_mapping(0.)

    def train(self, rounds: int) -> None:
        """Run *rounds* training iterations; subclasses override."""
        raise NotImplementedError


class MonteCarloAlgorithm(Algorithm):
    def __init__(self):
        super().__init__()