Example #1
import numpy as np

from typing import Iterable, List, Mapping, Tuple, TypeVar

# Distribution, Choose, Bernoulli and FunctionApprox are assumed to come
# from the RL-book `rl` package these snippets are written against.
from rl.distribution import Bernoulli, Choose, Distribution
from rl.function_approx import FunctionApprox

S = TypeVar('S')
A = TypeVar('A')


def sarsa_control(start_states: Distribution[S],
                  transition_fcn: Mapping[S, Mapping[A, Distribution[
                      Tuple[S, float]]]],
                  state_action: Mapping[S, List[A]],
                  approx_0: FunctionApprox[Tuple[S, A]], gamma: float,
                  ϵ: float) -> Iterable[FunctionApprox[Tuple[S, A]]]:
    """
    Update the Q-value function approximation using SARSA.
    The initial state is sampled from start_states.
    """
    q = approx_0
    state = start_states.sample()
    action = Choose(set(state_action[state])).sample()
    while True:
        # sample the next state and reward from the transition distribution
        next_state, reward = transition_fcn[state][action].sample()
        # use an ϵ-greedy policy to pick next_action
        explore = Bernoulli(ϵ)
        if explore.sample():
            next_action = Choose(set(state_action[next_state])).sample()
        else:
            next_action = state_action[next_state][np.argmax(
                [q((next_state, a)) for a in state_action[next_state]])]
        # move q towards the SARSA target: reward + gamma * Q(s', a')
        q = q.update([((state, action),
                       reward + gamma * q((next_state, next_action)))])
        state, action = next_state, next_action
        yield q
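
A minimal driving sketch for Example #1: sarsa_control is a generator, so pulling items off it advances the learning loop. The names start_dist, transitions, actions_for and q0 below are hypothetical placeholders, assumed to be built with the same Distribution / FunctionApprox types the signature asks for; only the iteration pattern is the point here.

import itertools

# Hypothetical inputs (not defined here): start_dist, transitions,
# actions_for and q0 stand in for a Distribution[S], the nested transition
# mapping, the state->actions mapping and an initial FunctionApprox.
q_iter = sarsa_control(start_dist, transitions, actions_for, q0,
                       gamma=0.9, ϵ=0.1)

# keep only the Q-value approximation reached after 10_000 SARSA updates
q_10000 = list(itertools.islice(q_iter, 10_000))[-1]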
Example #2
import random

import numpy as np

from typing import Dict, Iterable, List, Tuple, TypeVar

# Bernoulli, Choose and mdp.TransitionStep are assumed to come from the
# RL-book `rl` package these snippets are written against.
import rl.markov_decision_process as mdp
from rl.distribution import Bernoulli, Choose

S = TypeVar('S')
A = TypeVar('A')


def lspi(memory: List[mdp.TransitionStep[S]],
         feature_map: Dict[Tuple[S, A], List[float]],
         state_action: Dict[S, List[A]], m: int, gamma: float,
         ϵ: float) -> Iterable[Dict[Tuple[S, A], float]]:
    """
    Accumulate A and b from sampled transitions, solve A w = b for the
    weight vector w, and yield the implied Q-values after each update.
    feature_map: key: (state, action), value: phi(s, a), a feature vector
    of dimension m
    """
    # initialize A as a small multiple of the identity (invertible without
    # the arbitrary bias a random matrix would add) and b as zeros
    A = np.eye(m) * 1e-5
    b = np.zeros(m)
    w = np.linalg.solve(A, b)
    while True:
        transition = random.choice(memory)
        state = transition.state
        next_state = transition.next_state
        feature_state = np.array(feature_map[(state, transition.action)])
        # next_action is chosen by an ϵ-greedy policy w.r.t. the current w
        explore = Bernoulli(ϵ)
        if explore.sample():
            next_action = Choose(set(state_action[next_state])).sample()
        else:
            next_action = state_action[next_state][np.argmax([
                np.array(feature_map[(next_state, action)]) @ w
                for action in state_action[next_state]
            ])]
        feature_next_state = np.array(feature_map[(next_state, next_action)])
        # rank-one updates: A += phi(s,a) (phi(s,a) - gamma * phi(s',a'))^T
        A += np.outer(feature_state,
                      feature_state - gamma * feature_next_state)
        b += feature_state * transition.reward
        w = np.linalg.solve(A, b)
        yield {
            s_a: np.array(feature_map[s_a]) @ w
            for s_a in feature_map
        }
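
Example #2 yields a plain dict of Q-values per (state, action) pair. The snippet below is a self-contained sketch of turning one such dict into a greedy deterministic policy; the q_values literal is made-up illustrative data, not output of lspi.

# made-up Q-values for two states and two actions, standing in for one
# dict yielded by lspi
q_values = {
    ('s1', 'left'): 0.4, ('s1', 'right'): 0.9,
    ('s2', 'left'): 0.1, ('s2', 'right'): -0.2,
}

# greedy policy: for each state keep the action with the largest Q-value
greedy_policy: dict = {}
for (state, action), value in q_values.items():
    best = greedy_policy.get(state)
    if best is None or value > q_values[(state, best)]:
        greedy_policy[state] = action

print(greedy_policy)  # {'s1': 'right', 's2': 'left'}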