Example #1
File: lfa.py  Project: zhan0903/easy21
def expand_Q(w):
    # Expand the linear weights w into a full tabular Q(s, a) for plotting and MSE.
    Q = np.zeros((10, 21, 2))

    for dealer in DEALER_RANGE:
        for player in PLAYER_RANGE:
            for action in ACTIONS:
                state = State()
                state.dealercard = dealer
                state.playersum = player
                feats = phi(state, action)
                Q[dealer - 1, player - 1, action] = np.sum(feats * w)

    return Q
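
expand_Q evaluates the linear model on every (dealer, player, action) cell, so it relies on the feature map phi and the weight shape FEATS_SHAPE, which are defined elsewhere in lfa.py and not shown in this excerpt. A minimal sketch, assuming the overlapping coarse coding commonly used for Easy21 (3 dealer intervals x 6 player intervals x 2 actions = 36 binary features) and actions encoded as 0 and 1; the project's actual binning may differ:

import numpy as np

# Hypothetical coarse coding; interval bounds are inclusive and overlap on purpose.
DEALER_INTERVALS = [(1, 4), (4, 7), (7, 10)]
PLAYER_INTERVALS = [(1, 6), (4, 9), (7, 12), (10, 15), (13, 18), (16, 21)]
FEATS_SHAPE = (len(DEALER_INTERVALS), len(PLAYER_INTERVALS), 2)

def phi(state, action):
    # Binary feature tensor: 1 wherever (dealercard, playersum) falls inside an interval pair.
    feats = np.zeros(FEATS_SHAPE)
    for i, (d_lo, d_hi) in enumerate(DEALER_INTERVALS):
        for j, (p_lo, p_hi) in enumerate(PLAYER_INTERVALS):
            if d_lo <= state.dealercard <= d_hi and p_lo <= state.playersum <= p_hi:
                feats[i, j, action] = 1.0
    return feats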
Example #2
def Lfa():
    lmbd = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    learning_curves = {}
    num_episodes = 2000

    # Reference Q (dumped after 1,000,000 episodes) used as the MSE target.
    with open("./Q_dump_episodes_1000000.pkl", "rb") as f:
        opt_value = pickle.load(f)

    for item in lmbd:
        Q_value, error_history = lfa_learn(item, opt_value, num_episodes)
        learning_curves[item] = error_history

    plot_file = "./outcome/lfa_error_{}_episodes_time_{}.pdf".format(num_episodes, time.time())
    plot_learning_curve(learning_curves, save=plot_file)
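
The opt_value loaded above is the target that lfa_learn and Sarsa_lamda_Control report their error against; the mse helper itself is not shown in these excerpts. A one-line sketch, assuming it is a plain mean squared error over all state-action entries:

import numpy as np

def mse(Q, opt_value):
    # Mean squared error between the learned Q and the reference Q.
    return np.mean((Q - opt_value) ** 2)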
Example #3
File: sarsa.py  Project: zhan0903/easy21
def Sarsa_lamda_Control(lmbd, opt_value, num_episodes):
    # initialize tabular Q, visit counts, and the learning curve
    value = np.zeros((10, 21, 2))
    counter = np.zeros((10, 21, 2))
    error_history = []

    for episode in range(1, num_episodes + 1):
        # initialize env
        state1 = State()
        state1.dealercard = random.randint(1, 10)
        state1.playersum = random.randint(1, 10)

        # eligibility traces, reset at the start of every episode
        E = np.zeros((10, 21, 2))
        while state1 != "terminal":
            action1 = Epsilon_greedy_policy(value, counter, state1)
            state2, reward = Step(state1, action1)
            idx1 = (state1.dealercard - 1, state1.playersum - 1, action1)
            Q1 = value[idx1]

            if state2 == "terminal":
                Q2 = 0.0
            else:
                # on-policy: choose the next action with the same epsilon-greedy policy
                action2 = Epsilon_greedy_policy(value, counter, state2)
                idx2 = (state2.dealercard - 1, state2.playersum - 1, action2)
                Q2 = value[idx2]

            counter[idx1] += 1
            E[idx1] += 1  # accumulating trace

            alpha = 1.0 / counter[idx1]          # count-based step size
            delta = reward + GAMMA * Q2 - Q1     # TD error

            value += alpha * delta * E
            E *= GAMMA * lmbd                    # decay all traces

            state1 = state2

        error_history.append((episode, mse(value, opt_value)))

    return value, error_history
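
Epsilon_greedy_policy is assumed above but not shown. A sketch of a count-based epsilon-greedy choice, assuming the schedule epsilon = N0 / (N0 + N(s)) with N0 = 100 as in the original Easy21 exercise, and actions encoded as 0 and 1 (the project's actual function and constant may differ):

import random
import numpy as np

N0 = 100  # assumed exploration constant

def Epsilon_greedy_policy(value, counter, state):
    idx = (state.dealercard - 1, state.playersum - 1)
    n_state = np.sum(counter[idx])          # visits to this state, summed over both actions
    epsilon = N0 / (N0 + n_state)
    if random.random() < epsilon:
        return random.randint(0, 1)         # explore: random action
    return int(np.argmax(value[idx]))       # exploit: greedy action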
Example #4
def Sarsa():
    lmbd = [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
    learning_curves = {}
    num_episodes = 20000

    # Reference Q (dumped after 1,000,000 episodes) used as the MSE target.
    with open("./Q_dump_episodes_1000000.pkl", "rb") as f:
        opt_value = pickle.load(f)

    for item in lmbd:
        Q_value, error_history = Sarsa_lamda_Control(item, opt_value, num_episodes)
        learning_curves[item] = error_history

    plot_file = "./outcome/Sarsa_error_{}_episodes_time_{}.pdf".format(num_episodes, time.time())
    plot_learning_curve(learning_curves, save=plot_file)
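
Both control loops depend on the environment transition Step and the discount GAMMA, which are defined elsewhere in the project. A compact sketch of the Easy21 rules (cards valued 1-10, black with probability 2/3, red with 1/3; the dealer sticks at 17 or above; win +1, lose -1, draw 0), under the assumed action encoding 1 = hit, 0 = stick:

import random

GAMMA = 1.0  # undiscounted episodic task (assumed)

def draw_card():
    # Card value 1-10 uniformly; black cards add, red cards (probability 1/3) subtract.
    value = random.randint(1, 10)
    return value if random.random() < 2.0 / 3.0 else -value

def Step(state, action):
    if action == 1:                              # hit: player draws a card
        state.playersum += draw_card()
        if state.playersum < 1 or state.playersum > 21:
            return "terminal", -1                # player goes bust
        return state, 0
    dealer = state.dealercard                    # stick: dealer plays out its hand
    while 1 <= dealer < 17:
        dealer += draw_card()
    if dealer < 1 or dealer > 21:
        return "terminal", 1                     # dealer goes bust
    if dealer > state.playersum:
        return "terminal", -1
    if dealer < state.playersum:
        return "terminal", 1
    return "terminal", 0                         # draw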
Example #5
File: lfa.py  Project: zhan0903/easy21
def lfa_learn(lmbd, opt_value, num_episodes):
    # initialize
    Q = np.zeros((10, 21, 2))
    error_history = []
    # small random initial weights for the linear approximator
    w = (np.random.rand(*FEATS_SHAPE) - 0.5) * 0.001

    for episode in range(1, num_episodes + 1):
        # initialize env
        state1 = State()
        state1.dealercard = random.randint(1, 10)
        state1.playersum = random.randint(1, 10)
        # eligibility traces over the weight vector
        E = np.zeros_like(w)

        while state1 != "terminal":
            Qhat1, action1 = policy(state1, w)
            state2, reward = Step(state1, action1)

            # bootstrap target: Qhat(S', A') is zero at the terminal state
            if state2 == "terminal":
                Qhat2 = 0.0
            else:
                Qhat2, action2 = policy(state2, w)

            # for a linear approximator, grad_w Qhat(s, a) = phi(s, a)
            feats1 = phi(state1, action1)
            grad_w_Qhat1 = feats1

            delta = reward + GAMMA * Qhat2 - Qhat1   # TD error
            E = GAMMA * lmbd * E + grad_w_Qhat1      # accumulate eligibility traces
            dw = ALPHA * delta * E

            w += dw
            state1 = state2

            # expand w to a full tabular Q and record the MSE after every step
            Q = expand_Q(w)
            error_history.append((episode, mse(Q, opt_value)))

    return Q, error_history
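
lfa_learn also relies on policy(state, w), ALPHA, and GAMMA from lfa.py, none of which appear in this excerpt. A sketch of a constant-epsilon greedy policy over the approximated values, assuming epsilon = 0.05 and step size ALPHA = 0.01 as in the original Easy21 exercise, and reusing the hypothetical phi sketched after Example #1:

import random
import numpy as np

EPSILON = 0.05  # assumed constant exploration rate
ALPHA = 0.01    # assumed constant step size

def policy(state, w):
    # Epsilon-greedy over Qhat(s, a) = sum(phi(s, a) * w); returns the chosen value and action.
    qhat = np.array([np.sum(phi(state, a) * w) for a in (0, 1)])
    if random.random() < EPSILON:
        action = random.randint(0, 1)
    else:
        action = int(np.argmax(qhat))
    return qhat[action], action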