Example #1
def runMC(size):
    q = np.zeros((2, 11, 22))
    counterSA = np.zeros((2, 11, 22))
    counterState = np.zeros((11, 22))

    for i in range(size):
        # sample one episode
        s0 = easy21.init()
        s = s0
        #print("init ", s)
        episodes = []
        while s[0] != 1:
            counterState[s[1:]] += 1

            a = getAction(s, q, counterState)
            #print("action: ", a)
            sa = (a, s[1], s[2])
            counterSA[sa] += 1
            episodes.append(sa)
            sprime = easy21.step(s, a)
            s = sprime
            #print("state: ", s)
        #print("result: ", s[2])

        # every-visit Monte Carlo update: move Q(s, a) toward the episode's
        # final reward with step size 1 / N(s, a)
        for sa in episodes:
            q[sa] += (1 / counterSA[sa]) * (s[2] - q[sa])
    return q
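
A minimal driver sketch (not part of the example; it assumes runMC and the easy21 module above are importable): train for a large number of episodes and save the resulting Q array as "montecarlo-qsa.npy", the ground-truth file the TD(lambda) examples below load for their error curves.

import numpy as np

# Hypothetical driver: estimate Q(s, a) with Monte Carlo control and save it
# so the TD examples can compute their mean-squared-error learning curves.
q = runMC(1000000)
np.save("montecarlo-qsa.npy", q)
v = np.max(q, axis=0)  # greedy state-value function, shape (11, 22)
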
Example #2
    def train(self):
        """Train the agent on NUM_EPISODES to learn the action-value function"""
        elapsed_episodes = 0
        while elapsed_episodes < const.SARSA_NUM_EPISODES:
            eligibility_trace = np.zeros(const.STATE_ACTION_SPACE)
            state = easy21.new_game()
            epsilon = const.N_0 / (const.N_0 + self.num_visits(state))
            action = epsilon_greedy(epsilon, self.Q, state)
            while state.terminal != const.TERMINAL:
                # Take 1 step forward from the current state and simulate action
                next_state, reward = easy21.step(state, action)
                epsilon = const.N_0 / (const.N_0 + self.num_visits(next_state))
                next_action = epsilon_greedy(epsilon, self.Q, next_state)

                # Compute TD-error and increment counts
                cur_idx = (state.player, state.dealer, action)
                next_idx = (next_state.player, next_state.dealer, next_action)
                td_error = reward + self.Q[next_idx] - self.Q[cur_idx]
                eligibility_trace[cur_idx] += 1
                self.state_action_counts[cur_idx] += 1
                self.state_visits[state.player - 1, state.dealer - 1] += 1

                # Update action-value function and eligibility traces; alpha is
                # computed after the count increment so the first visit uses alpha = 1
                alpha = 1 / self.state_action_count(state, action)
                self.Q += alpha * td_error * eligibility_trace
                eligibility_trace *= const.LAMBDA
                state = next_state
                action = next_action

            elapsed_episodes += 1
Example #3
def run_episode(Q_sa, N_sa, lambd):
    E_sa = np.zeros((2, 11, 22))
    gamma = 1
    dealers_first_card = easy21.get_first_card()
    players_first_card = easy21.get_first_card()
    state = easy21.State(dealers_first_card, players_first_card)
    # initialize ACTION
    action = easy21.get_action(state, Q_sa, N_sa)
    assert action in [0, 1], str(action)
    game_history = []
    while not state.is_terminal:
        old_players_sum = state.players_sum
        old_dealers_card = state.dealers_card
        old_action = action
        next_state, reward = easy21.step(state, action)
        if not next_state.is_terminal:
            next_action = easy21.get_action(next_state, Q_sa, N_sa)
            delta = reward + \
                    gamma * \
                    Q_sa[next_action, next_state.dealers_card, next_state.players_sum] - \
                    Q_sa[old_action, old_dealers_card, old_players_sum]
        else:
            next_action = 0
            delta = reward + gamma * 0 - Q_sa[old_action, old_dealers_card,
                                              old_players_sum]
        N_sa[old_action, old_dealers_card, old_players_sum] += 1
        alpha = 1 / N_sa[old_action, old_dealers_card, old_players_sum]
        E_sa[old_action, old_dealers_card, old_players_sum] += 1
        Q_sa = Q_sa + alpha * delta * E_sa
        E_sa = gamma * lambd * E_sa
        state = next_state
        action = next_action
    return (Q_sa, N_sa, reward)
Example #4
def run_episode(theta, lambd, feature_map):
    E_sa = np.zeros(36)
    gamma = 1
    dealers_first_card = easy21.get_first_card()
    players_first_card = easy21.get_first_card()
    state = easy21.State(dealers_first_card, players_first_card)
    # initialize ACTION
    action = get_feature_action(state, theta, feature_map)
    assert action in [0, 1], str(action)
    game_history = []
    while not state.is_terminal:
        old_players_sum = state.players_sum
        old_dealers_card = state.dealers_card
        old_action = action
        next_state, reward = easy21.step(state, action)
        if not next_state.is_terminal:
            next_action = get_feature_action(next_state, theta, feature_map)
            delta = reward + \
                    gamma * \
                    np.dot(feature_map[next_action, next_state.dealers_card, next_state.players_sum], theta) - \
                    np.dot(feature_map[old_action, old_dealers_card, old_players_sum], theta)
        else:
            next_action = 0
            delta = reward + gamma * 0 - np.dot(
                feature_map[old_action, old_dealers_card, old_players_sum],
                theta)
        alpha = 0.01
        E_sa = gamma * lambd * E_sa + feature_map[
            (old_action, old_dealers_card, old_players_sum)]
        theta = theta + alpha * delta * E_sa
        state = next_state
        action = next_action
    return (theta, reward)
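
A hedged training-loop sketch for the linear-approximation version; the construction of feature_map is an assumption (the example only shows it being indexed by (action, dealers_card, players_sum) and dotted with a 36-dimensional theta).

import numpy as np

# Hypothetical outer loop; run_episode and feature_map are assumed to be
# available as in the example above.
theta = np.zeros(36)
for _ in range(10000):
    theta, reward = run_episode(theta, 0.5, feature_map)
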
Example #5
def trial():
    state = State.new()
    states = []
    while not state.terminated:
        action = next_action(state)
        N[state] += 1
        N[(state, action)] += 1
        # remember every visited (state, action) pair for the end-of-episode update
        states.append((state, action))
        state, reward = step(state, action)
    update(states, reward)
Example #6
def play_from_state(state):
  is_terminal = False
  history = []

  while not is_terminal:
    action = get_action(state)
    history.append((state, action))
    reward, state, is_terminal = easy21.step(state, action)
  
  return reward, history
Example #7
def run_episode(Q_sa, N_sa):
    dealers_first_card = easy21.get_first_card()
    players_first_card = easy21.get_first_card()
    state = easy21.State(dealers_first_card, players_first_card)
    reward = None
    history = []
    action = ""
    while not state.is_terminal:
        action = easy21.get_action(state, Q_sa, N_sa)
        old_state = copy.deepcopy(state)
        state, reward = easy21.step(state, action)
        history.append((old_state, action, reward))
    return history
Example #8
def runTD(size, tdLambda):
    # init q
    q = np.zeros((2, 11, 22))  # action, dc, sum
    counterSA = np.zeros((2, 11, 22))
    counterState = np.zeros((11, 22))
    # plot learning curve
    sqrErr = []
    mcq = np.load("montecarlo-qsa.npy")

    # each episode
    for i in range(size):
        # Eligibility
        e = np.zeros((2, 11, 22))
        # init one episode
        s0 = easy21.init()
        s = s0
        a = getAction(s, q, counterState)
        #print("init ", s)
        # each step
        while s[0] != 1:
            #print("action: ", a)
            counterState[s[1:]] += 1

            # get s' and a'
            sprime = easy21.step(s, a)
            aprime = getAction(
                sprime, q, counterState
            ) if sprime[0] != 1 else 0  #  handle a' for terminal s'

            # err
            sa, saprime = (a, s[1], s[2]), (aprime, sprime[1], sprime[2])
            counterSA[sa] += 1
            alpha = 1 / counterSA[sa]

            # TD target: immediate reward plus Q(s', a') (zero at terminal s')
            r = 0 if sprime[0] != 1 else sprime[2]
            g = q[saprime] if sprime[0] != 1 else 0
            g += r
            err = g - q[sa]  # TD error uses the scalar Q(s, a), not the whole array
            e[sa] += 1

            # update q matrix and decay eligibility traces
            q = q + alpha * err * e
            e = e * tdLambda

            s, a = sprime, aprime
            #print("state: ", s)
        #print("result: ", s[2])
        sqrErr.append(np.power((q - mcq), 2).mean())
    return q, sqrErr
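
A plotting sketch (matplotlib and the specific lambda values are assumptions, not part of the example): sweep lambda and compare the per-episode mean-squared-error curves returned in sqrErr.

import matplotlib.pyplot as plt

# Hypothetical sweep; runTD and the saved montecarlo-qsa.npy ground truth
# from the Monte Carlo example are assumed to be available.
for lam in (0.0, 0.5, 1.0):
    _, sqr_err = runTD(10000, lam)
    plt.plot(sqr_err, label="lambda = {}".format(lam))
plt.xlabel("episode")
plt.ylabel("MSE against the Monte Carlo Q")
plt.legend()
plt.show()
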
Example #9
def runTD(size, tdLambda):    
    # init q
    qw = np.zeros((36, 1)) # weight w of q
    # plot learning curve
    sqrErr = []
    mcq = np.load("montecarlo-qsa.npy")

    # each episode
    for i in range(size):
        # Eligibility
        e = np.zeros((2, 3, 6))
        # init one episode
        s0 = easy21.init()
        s = s0
        a = getAction(s, qw)
        #print("init ", s)
        # each step
        while s[0] != 1:                       
            #print("action: ", a)            
            # get s' and a'
            sprime = easy21.step(s, a)
            aprime = getAction(sprime, qw) if sprime[0] != 1 else 0 

            # err 
            sa, saprime = (a, s[1], s[2]), (aprime, sprime[1], sprime[2])
            alpha = 0.01

            r = 0 if sprime[0] != 1 else sprime[2]
            g = getQValue(aprime, sprime[1], sprime[2], qw) if sprime[0] != 1 else 0
            g += r
            err = g - getQValue(a, s[1], s[2], qw)
            indList = convertSA(a, s[1], s[2])
            for ind in indList:
                e[ind[0], ind[1], ind[2]] += 1

            # update the weight vector and decay eligibility traces
            qw = qw + alpha * err * e.reshape(36, 1)
            e = e * tdLambda

            s, a = sprime, aprime
            #print("state: ", s)
        #print("result: ", s[2])
        sqrErr.append(np.power((getQMatrix(qw) - mcq), 2).mean())
    return qw, sqrErr
Example #10
def sarsa(lamb, all_errors=False):
    Q = defaultdict(int)
    N = defaultdict(int)
    errors = []

    for episode_i in range(EPISODES):
        if args.progress and episode_i % (0.1 * EPISODES) == 0:
            print('{:.2f}%...'.format((episode_i + EPISODES * lamb * 10) *
                                      100 / (EPISODES * len(lambdas))),
                  end='\r',
                  flush=True)

        E = defaultdict(int)
        state = easy21.init_state()
        action = get_action(Q, N, state)
        N[state, action] += 1
        is_terminal = False

        while not is_terminal:
            reward, new_state, is_terminal = easy21.step(state, action)
            new_action = get_action(Q, N, new_state)

            E[state, action] += 1
            N[new_state, new_action] += 1
            d = reward + DISCOUNT * Q[new_state, new_action] - Q[state, action]

            # update every state-action pair with a non-zero eligibility trace
            for (s, a), e in E.items():
                Q[s, a] += step_size(N, (s, a)) * d * e
                E[s, a] *= DISCOUNT * lamb

            state, action = new_state, new_action

        if all_errors: errors.append(calculate_error(Q))

    if all_errors:
        return errors, Q
    else:
        return calculate_error(Q), Q
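
A usage sketch (hedged: EPISODES, DISCOUNT, lambdas and args are module-level globals this function reads, so they must already be defined elsewhere).

# Hypothetical calls, assuming the module globals above are set up.
final_mse, Q = sarsa(0.5)                        # single error value at the end
learning_curve, Q = sarsa(0.9, all_errors=True)  # per-episode error list
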
Example #11
    def train(self):
        """Train the action-value function on NUM_EPISODES games."""
        elapsed_episodes = 0
        while elapsed_episodes < const.MC_NUM_EPISODES:
            state = easy21.new_game()
            episode = []  # [ (state_0, action_0, reward_1, state_1), ... ]

            # Experience 1 episode
            while state.terminal == const.NON_TERMINAL:
                epsilon = const.N_0 / (const.N_0 + self.num_visits(state))
                action = epsilon_greedy(epsilon, self.Q, state)
                next_state, rwd = easy21.step(state, action)
                episode.append((state, action, rwd, next_state))
                state = next_state

            # Learn from the experienced episode: with no discounting and zero
            # intermediate rewards, the return at every step is the final reward
            total_return = episode[-1][2]
            for (state, action, reward, next_state) in episode:
                self.increment_state_action_counts(state, action)
                self.increment_state_visits(state)
                alpha_t = 1 / self.state_action_count(state, action)
                self.update_q_function(state, action, alpha_t, total_return)

            elapsed_episodes += 1
Example #12
def run_episode(Q_sa, N_sa):
    gamma = 1
    dealers_first_card = easy21.get_first_card()
    players_first_card = easy21.get_first_card()
    state = easy21.State(dealers_first_card, players_first_card)
    while not state.is_terminal:
        # initialize ACTION
        action = easy21.get_egreedy_action(state, Q_sa, N_sa)
        # action = easy21.get_random_action() # we could also run this off policy!!
        old_state = copy.deepcopy(state)
        next_state, reward = easy21.step(state, action)
        if state.is_terminal:
            target = reward
        else:
            target = reward + gamma * np.max(Q_sa[:, state.dealers_card,
                                                  state.players_sum])
        N_sa[action, old_state.dealers_card, old_state.players_sum] += 1
        # alpha = 1 / N_sa[action, old_state.dealers_card, old_state.players_sum]
        alpha = 0.05
        Q_sa[action, old_state.dealers_card,
             old_state.players_sum] += alpha * (
                 target -
                 Q_sa[action, old_state.dealers_card, old_state.players_sum])
    return (Q_sa, N_sa, reward)
Example #13
            Z = defaultdict(float)
            state = State()
            e = epsilon(Nzero, Ns, state)
            action = greedysoft(state, actionvalue, e)
            episode = []

            while state.gameover==0:

                Nsa[(state.player, state.dealer, action)] += 1
                Ns[(state.player, state.dealer)] += 1
                episode += [(state.player, state.dealer, action)]
                startstate = copy(state)

                # Take action A, observe reward, S
                state, reward = step(state, action)
                e = epsilon(Nzero, Ns, state)

                # Choose A' from S' using Q policy
                ingameaction = greedysoft(state, actionvalue, e)

                # delta = R + gamma * Q(S', A') - Q(S, A); Z(S, A) += 1
                d = (reward
                     + discount * actionvalue[(state.player, state.dealer, ingameaction)]
                     - actionvalue[(startstate.player, startstate.dealer, action)])
                Z[(startstate.player, startstate.dealer, action)] += 1

                for stateaction in episode:
                    a = Nsa[stateaction] ** -1
                    actionvalue[stateaction] += a * d * Z[stateaction]
                    Z[stateaction] *= lamBda
Example #14
import colorama
import easy21

colorama.init(convert=True)


while True:
  state = easy21.init_state()
  print('state:', state)

  is_terminal = False
  while not is_terminal:
    print('action ({}it/{}tick): '.format(colorama.Fore.LIGHTGREEN_EX + 'h' + colorama.Style.RESET_ALL, colorama.Fore.LIGHTRED_EX + 's' + colorama.Style.RESET_ALL), end='')
    action = input()
    reward, state, is_terminal = easy21.step(state, action)
    print('state:', state)

  print('reward:', reward)
  print('\n')
Example #15
    # player and dealer each draw one black card
    dealer = np.random.randint(1, 11)
    player = np.random.randint(1, 11)
    deal = ([dealer, player], 0)

    # play one hand
    history = []
    reward = None
    while reward is None:

        dealer = deal[0][0]
        player = deal[0][1]
        action = epsilon_greedy(N0, N, Q, dealer, player)
        N[dealer - 1, player - 1, action] += 1
        deal = step([dealer, player], action)
        reward = deal[1]
        history.append(([dealer, player], action, reward))

    # update the Q matrix for this hand - based on https://github.com/xrz000/Easy21
    Gt = 0
    # backtrack from the end of the hand to the beginning
    for ([dealer, player], action, reward) in reversed(history):
        # update factor as per the assignment: the more visits to the state,
        # the smaller the update
        alpha = 1.0 / N[dealer - 1, player - 1, action]
        if reward is None:
            reward = 0
        Gt = gamma * Gt + reward  # cumulative reward for each move in the hand. Since the reward is only given at the end of the hand, this is the same for all states.
        Q[dealer - 1, player - 1, action] += alpha * (
            Gt - Q[dealer - 1, player - 1, action]