Example #1
def linear_approx_sarsa(max_episode, alpha, e, gamma, lbd, optimal_Q=None):
    theta = np.random.randn(36) / 100
    mse = []
    for i in range(max_episode):
        # initialize eligibility traces
        E = np.zeros(36)
        # start a new episode
        episode = Easy21()
        x, y = episode.observe()
        action = cal_action([x, y], theta, e)
        # sample until terminal
        while not episode.is_terminal():
            # run one step
            ([xp, yp], reward) = episode.step(action)
            if episode.is_terminal():
                # if the episode is in terminal state, Q[s', a'] is 0
                q0 = cal_q([x, y], action, theta)
                delta = reward - q0
                actionp = 0
            else:
                actionp = cal_action([xp, yp], theta, e)
                q0 = cal_q([x, y], action, theta)
                q1 = cal_q([xp, yp], actionp, theta)
                delta = reward + gamma * q1 - q0
            E = E * (gamma * lbd) + q_gradient([x, y], action)
            theta += (alpha * delta * E)
            x, y, action = xp, yp, actionp
        if (i % 1000 == 0) and (optimal_Q is not None):
            mse.append(np.sum((cal_q_table(theta) - optimal_Q)**2))
    return (theta, mse)
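
The helpers cal_q, cal_action, q_gradient and cal_q_table used above are not part of this snippet. Given the 36-dimensional theta, they presumably implement the coarse-coded binary features from the Easy21 assignment (3 dealer intervals x 6 player intervals x 2 actions); the sketch below is one possible implementation under that assumption, not the project's actual code.

import numpy as np

# Overlapping cuboids from the Easy21 assignment's coarse coding (assumed here).
DEALER_INTERVALS = [(1, 4), (4, 7), (7, 10)]
PLAYER_INTERVALS = [(1, 6), (4, 9), (7, 12), (10, 15), (13, 18), (16, 21)]


def q_gradient(state, action):
    # Binary feature vector phi(s, a); for a linear Q this is also the gradient w.r.t. theta.
    x, y = state  # dealer's showing card, player's sum
    phi = np.zeros(36)
    idx = 0
    for dl, dh in DEALER_INTERVALS:
        for pl, ph in PLAYER_INTERVALS:
            for a in (0, 1):
                if dl <= x <= dh and pl <= y <= ph and a == action:
                    phi[idx] = 1.0
                idx += 1
    return phi


def cal_q(state, action, theta):
    # Linear action-value estimate Q(s, a) = phi(s, a) . theta.
    return np.dot(q_gradient(state, action), theta)


def cal_action(state, theta, e):
    # Epsilon-greedy selection with a fixed exploration rate e.
    if np.random.rand() < e:
        return np.random.randint(2)
    return int(np.argmax([cal_q(state, a, theta) for a in (0, 1)]))


def cal_q_table(theta):
    # Tabulate Q over the 10 x 21 x 2 grid so it can be compared with optimal_Q.
    table = np.zeros([10, 21, 2])
    for x in range(1, 11):
        for y in range(1, 22):
            for a in (0, 1):
                table[x - 1, y - 1, a] = cal_q([x, y], a, theta)
    return table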
Example #2
File: sarsa.py Project: captn3m0/easy21
    def run(self):
        self.reset()
        game = Easy21()
        S = game.state()
        A = self.epsilon_greedy_action(S)
        while not game.isTerminal():
            Aprime = None
            game, R = game.step(A)
            Sprime = game.state()
            self.N[S][A] += 1
            self.E[S][A] += 1
            # Initialize Q to zero for "all" states
            # Our lookup table is only for interesting states
            # So we hack around by putting Q = 0
            if game.isTerminal():
                Q = 0
            else:
                Aprime = self.epsilon_greedy_action(Sprime)
                Q = self.q[Sprime][Aprime]
            """ This is our TD error """
            alpha = self.alpha(S, A)
            delta = R + Sarsa.GAMMA * Q - self.q[S][A]
            self.update(alpha, delta)

            S = Sprime
            A = Aprime
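
The alpha and update methods called above are not shown. A minimal sketch of what they could look like for backward-view SARSA(lambda), assuming self.q, self.N and self.E are nested dictionaries keyed by state then action, and that Sarsa.GAMMA and Sarsa.LAMBDA are class constants (LAMBDA is an assumption here):

    def alpha(self, S, A):
        # step size 1/N(s, a), as in the Easy21 assignment
        return 1.0 / self.N[S][A]

    def update(self, alpha, delta):
        # backward-view SARSA(lambda): move every (s, a) along its
        # eligibility trace, then decay all traces
        for s in self.q:
            for a in self.q[s]:
                self.q[s][a] += alpha * delta * self.E[s][a]
                self.E[s][a] *= Sarsa.GAMMA * Sarsa.LAMBDA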
Example #3
def sarsa_lambda(max_episode, gamma, lbd, N0, optimal_Q=None):
    # sarsa(lambda)
    Q = np.zeros([10, 21, 2])
    N = np.zeros([10, 21, 2])
    mse = []
    for i in range(max_episode):
        # initialize eligibility traces
        E = np.zeros([10, 21, 2])
        # start a new episode
        episode = Easy21()
        x, y = episode.observe()
        action = epsilon_greedy(N0, N, Q, x, y)
        # sample until terminal
        while not episode.is_terminal():
            N[x - 1, y - 1, action] += 1
            E[x - 1, y - 1, action] += 1
            # run one step
            ([xp, yp], reward) = episode.step(action)
            if episode.is_terminal():
                # if the episode is in terminal state, Q[s', a'] is 0
                delta = reward - Q[x - 1, y - 1, action]
                actionp = 0
            else:
                actionp = epsilon_greedy(N0, N, Q, xp, yp)
                delta = (reward + gamma * Q[xp - 1, yp - 1, actionp]
                         - Q[x - 1, y - 1, action])
            alpha = 1.0 / N[x - 1, y - 1, action]
            Q += (alpha * delta * E)
            E *= (gamma * lbd)
            x, y, action = xp, yp, actionp
        if (i % 1000 == 0) and (optimal_Q is not None):
            mse.append(np.sum((Q - optimal_Q)**2))
    return (Q, mse)
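
The epsilon_greedy(N0, N, Q, x, y) helper is not included in this snippet. A minimal sketch, assuming it uses the assignment's schedule epsilon = N0 / (N0 + N(s)) over the 1-indexed (dealer, player) state, as Example #12's eps_greedy does:

import numpy as np


def epsilon_greedy(N0, N, Q, x, y):
    # epsilon = N0 / (N0 + number of visits to state (x, y))
    epsilon = N0 / (N0 + np.sum(N[x - 1, y - 1, :]))
    if np.random.rand() < epsilon:
        return np.random.randint(2)            # explore: pick one of the two actions at random
    return int(np.argmax(Q[x - 1, y - 1, :]))  # exploit: greedy action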
Example #4
def monte_carlo(max_episode, gamma, N0):
    # monte carlo
    Q = np.zeros([10, 21, 2])
    N = np.zeros([10, 21, 2])
    for i in range(max_episode):
        # start a new episode
        episode = Easy21()
        # the initial state of the episode
        x, y = episode.observe()
        # sample until terminal
        history = []
        while not episode.is_terminal():
            # decide action
            action = epsilon_greedy(N0, N, Q, x, y)
            N[x - 1, y - 1, action] += 1
            # run one step
            (state, reward) = episode.step(action)
            history.append(([x, y], action, reward))
            [x, y] = state
        # calculate return Gt for each state in this episode
        Gt = 0
        for (state, action, reward) in reversed(history):
            [x, y] = state
            alpha = 1.0 / N[x - 1, y - 1, action]
            Gt = gamma * Gt + reward
            Q[x - 1, y - 1, action] += alpha * (Gt - Q[x - 1, y - 1, action])
    return Q
Example #5
    def test_initialization(self):
        for i in range(30):
            game = Easy21()
            state = game.state()

            # [ dealer's sum ,  player's sum ]
            self.assertIsInstance(state, list)
            self.assertEqual(len(state), 2)

            self.assertTrue(state[0] in range(1, 11))
            self.assertTrue(state[1] in range(1, 22))

            state, reward = game.step(STICK)
            self.assertEqual(state, TERMINATED)
            self.assertTrue(reward in [-1, 0, 1])
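
The Easy21 environment itself is not listed, and the snippets use two slightly different interfaces (observe()/is_terminal() versus state()/isTerminal()). Below is a minimal sketch of the game rules from the assignment description, written against the state()/step() interface that the test above exercises; the HIT/STICK/TERMINATED constants and the class internals are assumptions, not any particular project's code.

import random

HIT, STICK = 0, 1
TERMINATED = "terminal"  # sentinel state returned once the episode ends


def draw(first=False):
    # Draw a card with value 1-10; black (added) with prob 2/3, red (subtracted) with prob 1/3.
    value = random.randint(1, 10)
    if first or random.random() < 2.0 / 3.0:
        return value
    return -value


class Easy21:
    def __init__(self):
        # both players start with one black card
        self.dealer = draw(first=True)
        self.player = draw(first=True)
        self.terminal = False

    def state(self):
        return TERMINATED if self.terminal else [self.dealer, self.player]

    def isTerminal(self):
        return self.terminal

    def step(self, action):
        reward = 0
        if action == HIT:
            self.player += draw()
            if self.player < 1 or self.player > 21:  # player goes bust
                self.terminal, reward = True, -1
        else:  # STICK: dealer hits until reaching 17 or going bust
            while 1 <= self.dealer < 17:
                self.dealer += draw()
            self.terminal = True
            if self.dealer < 1 or self.dealer > 21:
                reward = 1
            elif self.player > self.dealer:
                reward = 1
            elif self.player < self.dealer:
                reward = -1
        return self.state(), reward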
Example #6
 def run(self):
   game = Easy21()
   G = None
   walk = []
   while True:
     s = game.state()
     action = self.epsilon_greedy_action(s)
     self.N[s][action] += 1
     game, G = game.step(action)
     walk.append((s, action))
     # We break if the game has ended
     if game.isTerminal():
       break
   for s, action in walk:
      self.q[s][action] += self.alpha(s, action) * (G - self.q[s][action])
Example #7
File: mc.py Project: matthewygf/RL_DS
def main():
    # initialize
    env = Easy21()
    runs = 1000

    for t in range(runs):
        env.start()
        results = []
        total_reward = 0.0
        start_dealer, start_player = env.state()
        print (" started at start state dealer %d and player %d" % (start_dealer, start_player))
        # run our episode
        while not env.is_finished():
            current_state = env.state()
            dealer, player = current_state
            print (" start at current state dealer %d, player %d" % (dealer, player))
            action = ep_greedy(current_state)
            print ("ep-greedy picked action %d" % action)
            next_state, reward = env.step(current_state, action)
            print ("------------------------------------")
            print (next_state)
            print (reward)
            print ("------------------------------------")
            # MC record our current state and the reward from this state onwards
            results.append((current_state, action))
            total_reward += reward
            if(dealer <= 10 and player <= 21):
                current_state = next_state

        # after an end of the episode
        for (state, action) in results:
            # increment the state and state-action counts
            dealer, player = state
            possible_states[dealer-1, player-1] += 1
            possible_states_actions[dealer-1, player-1, action] += 1
            # step-size
            alpha = 1/possible_states_actions[dealer-1, player-1, action]
            # update our value function in the direction towards the reward
            q_states_actions[dealer-1, player-1, action] += alpha * (total_reward - q_states_actions[dealer-1, player-1, action])

    with open('q_true.txt', 'w+') as outfile:
        outfile.write('# Array shape: {0}\n'.format(q_states_actions.shape))

        # equivalent to array[i,:,:]
        for data_slice in q_states_actions:
            np.savetxt(outfile, data_slice, fmt='%-7.2f')

            outfile.write('# New slice \n')
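
Examples #7, #11 and #15 rely on module-level globals (possible_states, possible_states_actions, q_states_actions, GAMMA) and an ep_greedy helper that are defined elsewhere in that project. A plausible sketch, assuming a 10 x 21 x 2 table and the epsilon = N0 / (N0 + N(s)) schedule; the exact constants are guesses.

import random
import numpy as np

DEALER_STATE, PLAYER_STATE, ACTIONS = 10, 21, 2
GAMMA = 1.0  # Easy21 episodes are undiscounted in the assignment
N0 = 100.0   # exploration constant for the epsilon schedule (assumed)

possible_states = np.zeros((DEALER_STATE, PLAYER_STATE), dtype=np.int64)
possible_states_actions = np.zeros((DEALER_STATE, PLAYER_STATE, ACTIONS), dtype=np.int64)
q_states_actions = np.zeros((DEALER_STATE, PLAYER_STATE, ACTIONS), dtype=np.float32)


def ep_greedy(state):
    # epsilon-greedy over the tabular Q, with epsilon = N0 / (N0 + N(s))
    dealer, player = state
    epsilon = N0 / (N0 + possible_states[dealer - 1, player - 1])
    if random.random() < epsilon:
        return random.randint(0, ACTIONS - 1)
    return int(np.argmax(q_states_actions[dealer - 1, player - 1, :]))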
Example #8
 def run(self):
     self.reset()
     game = Easy21()
     S = game.state()
     A = self.epsilon_greedy_action(S)
     while not game.isTerminal():
         Aprime = None
         game, R = game.step(A)
         Sprime = game.state()
         # Initialize Q to zero for "all" states
         # Our lookup table is only for interesting states
         # So we hack around by putting Q = 0
         if game.isTerminal():
             Q = 0
         else:
             Aprime = self.epsilon_greedy_action(Sprime)
             Q = self.Q(Sprime, Aprime)
         """ This is our TD error """
         delta = R + Sarsa2.GAMMA * Q - self.Q(S, A)
         features = FeatureExtractor(S, A).features()
         self.update(delta, features)
         S = Sprime
         A = Aprime
Example #9
File: main.py Project: rapsealk/Easy21
 def setUp(self):
     self.env = Easy21()
     self.state = self.env.reset()
Example #10
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
import numpy as np


def game_state_to_control_state(game_state):
    return PLAYER_MAX*(game_state[0]-1) + game_state[1] - 1


# start learning
control = TDControl(CARD_MAX*PLAYER_MAX, len(ACTIONS))
control_mc = MCControl(CARD_MAX*PLAYER_MAX, len(ACTIONS))

for i in range(50000):
    game = Easy21(verbose=False)
    episode = []
    state = game.state()

    while state != TERMINATED:
        action = control.action(game_state_to_control_state(state))
        next_state, reward = game.step(ACTIONS[action])

        control.experience(game_state_to_control_state(state), action, reward)
        control_mc.experience(game_state_to_control_state(state), action, reward)

        # input('')
        # print(control.Qsa[game_state_to_control_state(state)][action])
        # print(control.Esa[game_state_to_control_state(state)][action])
        state = next_state
Example #11
def main():
    # initialise our env
    env = Easy21()
    # run 1000 episodes
    runs = 1000
    
    global q_states_actions
    global possible_states
    global possible_states_actions
    global GAMMA

    for x in [x * 0.1 for x in range(0, 10)]:
        lambda_x = x
        # run our episodes
        for t in range(runs):
            # initialize our records
            results = []
            # start the game
            env.start()
            # initialize first state and action
            start_state = env.state()  
            action = ep_greedy(start_state)

            # re-init eligibility traces each episode
            e_states_actions = np.zeros((DEALER_STATE, PLAYER_STATE, ACTIONS), dtype=np.float32)

            while not env.is_finished():
                current_state = env.state()
                # take action A, observe S' , Reward
                next_state, reward = env.step(current_state, action)
                next_dealer, next_player = next_state
                #print ("next_dealer is %d, next_player is %d, immediate reward is %d" % (next_dealer, next_player, reward))
                current_dealer, current_player = current_state
                current_q_values = q_states_actions[current_dealer-1, current_player-1, action]
                # if we should still proceed
                if(next_dealer <= 10 and next_player <= 21):
                    # choose A' from S' using e-greedy
                    next_action = ep_greedy(next_state)

                    # calculate the TD error (delta)
                    next_q_values = q_states_actions[next_dealer-1, next_player-1, next_action]
                    td_error = reward + (GAMMA * next_q_values) - current_q_values
                else:
                    td_error = reward - current_q_values
                
                # add counts during the episode
                e_states_actions[current_dealer-1, current_player-1, action] += 1
                possible_states[current_dealer-1, current_player-1] += 1
                possible_states_actions[current_dealer-1, current_player-1, action] += 1
                # step-size
                alpha = 1/possible_states_actions[current_dealer-1, current_player-1, action]
                results.append((current_state, action))

                # update the action-values and eligibility traces of every (state, action) visited so far
                for (s, a) in results:
                    dealer, player = s
                    q_states_actions[dealer-1, player-1, a] += alpha * td_error * e_states_actions[dealer-1, player-1, a]
                    e_states_actions[dealer-1, player-1, a] = GAMMA * lambda_x * e_states_actions[dealer-1, player-1, a]

                if(next_dealer <= 10 and next_player <= 21):
                    current_state = next_state
                    action = next_action

        # read the true Q values from disk
        q_true_txt = np.loadtxt('q_true.txt')

        # original shape of the array
        q_true = q_true_txt.reshape((DEALER_STATE,PLAYER_STATE,ACTIONS))
        q_diff = q_states_actions - q_true
        print(np.mean(np.square(q_diff)))
Example #12
# epsilon-greedy policy(action) selection from current value function
def eps_greedy(state, n0=100.0):
    s1, s2 = state
    epsilon = n0 / (n0 + n_state[s1, s2])

    if random.random() < epsilon:
        # random selection
        if random.random() < 0.5: return 0
        else: return 1
    else:
        # greedy selection
        return np.argmax(q_sa[s1, s2, :])


my_game = Easy21()
max_epoch = 100000

for epoch in range(max_epoch):
    curr_state = my_game.init_game()
    results_list = []
    result_sum = 0.0

    # start and finish one episode
    while curr_state is not None:
        curr_action = eps_greedy(curr_state)
        next_state, curr_result = my_game.step(curr_action)
        results_list.append((curr_state, curr_action))
        result_sum += curr_result
        curr_state = next_state
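
The snippet ends after the episode loop; the Monte Carlo update that such a loop typically feeds is not shown. One plausible completion, continuing inside the epoch loop and assuming an n_sa visit counter alongside the n_state and q_sa tables used above:

    # after the episode: every-visit Monte Carlo update towards the episode return
    for (s1, s2), a in results_list:
        n_state[s1, s2] += 1
        n_sa[s1, s2, a] += 1                      # assumed per-(s, a) visit counter
        alpha = 1.0 / n_sa[s1, s2, a]
        q_sa[s1, s2, a] += alpha * (result_sum - q_sa[s1, s2, a])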
Example #13
File: td_control.py Project: vuk119/RL
                state = next_state
                action = next_action

            norms.append(np.linalg.norm(self.Q[1:22, 1:11, :]))

        if log:
            print("Completed the total of {} episodes".format(n_episodes))
            print("The total time taken is {}s".format(time.time() - start))

        return self.Q, norms


from easy21 import Easy21, Action

env = Easy21()
env_name = 'Easy21'
TD = TDLambda(env, 2, (22, 22))

Q = TD.train(n_episodes=100000)

V = np.max(Q, axis=-1)
relevant_V = V[1:22, 1:11]

import matplotlib.pyplot as plt

policy = np.argmax(Q, axis=-1)
plt.subplot(121)
plt.imshow(V)
plt.subplot(122)
plt.imshow(relevant_V)
Example #14
 def setUp(self):
     self.env = Easy21()
Example #15
def main():
    # initialise our env
    env = Easy21()
    # run 1000 episodes
    runs = 1000

    global q_states_actions
    global DEALERS_FEATURE
    global PLAYERS_FEATURE
    global ACTIONS_FEATURE

    for x in [x * 0.1 for x in range(0, 10)]:
        lambda_x = x
        win = 0
        for t in range(runs):
            # initialize our records
            results = []
            # start the game
            env.start()
            # initialize first state and action
            action = ep_greedy(env.state())

            # initialize our eligibility traces for each episode
            traces = np.zeros(
                (DEALERS_FEATURE * PLAYERS_FEATURE * ACTIONS_FEATURE),
                dtype=np.float32)
            # initialize weights; there should be one weight per feature
            weights = np.random.uniform(0,
                                        1,
                                        size=DEALERS_FEATURE *
                                        PLAYERS_FEATURE * ACTIONS_FEATURE)
            q_states_actions = np.zeros(
                (DEALERS_STATE, PLAYERS_STATE, ACTIONS_FEATURE),
                dtype=np.float32)

            while not env.is_finished():
                current_state = env.state()
                current_dealer, current_player = current_state
                # take action A, observe S', Reward
                next_state, reward = env.step(current_state, action)
                next_dealer, next_player = next_state
                # compute Q(s, a) and the feature vector for the current state-action pair (http://artint.info/html/ArtInt_272.html)
                current_q_values = Q_approx(current_dealer - 1,
                                            current_player - 1, action,
                                            weights)
                current_feature_vec = Features(current_dealer - 1,
                                               current_player - 1, action)

                # if we should still proceed
                if (next_dealer <= 10 and next_player <= 21):
                    # choose A' from S' using e-greedy
                    next_action = ep_greedy(next_state)

                    # calculate the TD error (delta) using Q(S', A')
                    next_q_values = Q_approx(next_dealer - 1,
                                             next_player - 1, next_action,
                                             weights)
                    td_error = reward + (GAMMA *
                                         next_q_values) - current_q_values
                else:
                    td_error = reward - current_q_values

                # backward view lambda
                traces = GAMMA * lambda_x * traces + current_feature_vec
                weights += ALPHA * td_error * traces
                if reward == 1:
                    win += 1

            # Read the q true from disk
            q_true_txt = np.loadtxt('q_true.txt')

            # original shape of the array
            q_true = q_true_txt.reshape(
                (DEALERS_STATE, PLAYERS_STATE, ACTIONS_FEATURE))
            q_diff = q_states_actions - q_true
            #print(np.mean(np.square(q_diff)))

        print("lambda %f, won %d" % (lambda_x, win))