Example #1
0
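# Assumed imports for this snippet; MountainCar and the helper functions used
# below (Q_calculation, find_action, grads_calculation, update, write_weights)
# are provided elsewhere. Hedged sketches of those helpers follow main().
import numpy as np
import matplotlib.pyplot as plt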
def main(args):
    mode = args[1]
    weight_out = args[2]
    returns_out = args[3]
    episodes = int(args[4])
    max_iterations = int(args[5])
    epsilon = float(args[6])
    gamma = float(args[7])
    learning_rate = float(args[8])

    beta = 0.96
    tmp = 0.0
    vt = np.array([], dtype="float64")
    returns_list = np.array([], dtype="float64")

    env = MountainCar(mode)
    S_size = env.state_space
    A_size = env.action_space
    W = np.zeros([S_size, A_size], dtype="float64")
    # print(W.shape)
    b = 0
    parameters = {"W": W, "b": b}

    with open(returns_out, "w") as fout:
        for i in range(episodes):
            env.reset()
            state = env.transform(env.state)
            # print(state)
            returns = 0.0
            done = False
            for j in range(max_iterations):
                Q = Q_calculation(state, parameters)
                # print(Q)
                a = find_action(epsilon, Q, A_size)
                grads, reward, state, done = grads_calculation(
                    parameters, state, a, env, Q, gamma)
                parameters = update(grads, parameters, learning_rate)
                returns += reward

                if done:
                    break
            returns_list = np.append(returns_list, returns)
            fout.write(str(returns) + "\n")
            tmp = beta * tmp + (1 - beta) * returns
            tmp1 = tmp / (1 - beta**(i + 1))  # bias-corrected exponential moving average of returns

            vt = np.append(vt, tmp1)
    # print(vt)

    x = range(1, episodes + 1)
    plt.plot(x, returns_list)
    plt.plot(x, vt)
    plt.legend(('Returns', 'Rolling Mean'), loc='upper left')
    plt.title(mode + " mode: returns and rolling mean")
    plt.ylabel("returns & rolling mean")
    plt.xlabel("episodes")
    plt.show()

    write_weights(parameters, weight_out)
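

# --- Hedged sketches of the helper functions main() assumes; these are
# --- illustrative implementations consistent with how they are called above,
# --- not necessarily the author's originals.
def Q_calculation(state, parameters):
    # Q(s, a) = b + sum_k state[k] * W[k, a] for every action a.
    W, b = parameters["W"], parameters["b"]
    Q = np.zeros(W.shape[1])
    for k, v in state.items():
        Q += v * W[k, :]
    return Q + b


def find_action(epsilon, Q, A_size):
    # Epsilon-greedy: explore with probability epsilon, otherwise act greedily.
    if np.random.uniform() < epsilon:
        return np.random.randint(A_size)
    return int(np.argmax(Q))


def grads_calculation(parameters, state, a, env, Q, gamma):
    # Take the step, then package the TD error and state features so that
    # update() can apply the semi-gradient Q-learning update.
    next_state, reward, done = env.step(a)
    Q_next = Q_calculation(next_state, parameters)
    td_error = Q[a] - (reward + gamma * np.max(Q_next))
    grads = {"td_error": td_error, "state": state, "action": a}
    return grads, reward, next_state, done


def update(grads, parameters, learning_rate):
    # Update the chosen action's weight column and the bias.
    W, b = parameters["W"], parameters["b"]
    for k, v in grads["state"].items():
        W[k, grads["action"]] -= learning_rate * grads["td_error"] * v
    b -= learning_rate * grads["td_error"]
    return {"W": W, "b": b}


def write_weights(parameters, weight_out):
    # Write the bias followed by the weights, one value per line.
    with open(weight_out, "w") as f:
        f.write(str(parameters["b"]) + "\n")
        for row in parameters["W"]:
            for value in row:
                f.write(str(value) + "\n")

Example #2
0
# Assumed import for this snippet; MountainCar and state_mode are provided
# elsewhere (a hedged sketch of state_mode follows q_learning()).
import numpy as np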
def q_learning(mode, w_out, r_out, epis, max_iter, eps, gamma, lr):
    epis = int(epis)
    max_iter = int(max_iter)
    eps = float(eps)
    gamma = float(gamma)
    lr = float(lr)
    env = MountainCar(mode)
    n_state = env.state_space
    n_action = env.action_space
    w = np.zeros((n_state, n_action), dtype=np.longdouble)
    b = 0
    rewards_sum = np.zeros((epis, 1), dtype=np.longdouble)

    for i in np.arange(epis):
        env.reset()  # start each episode from the initial state
        reward_cum = 0
        for j in np.arange(max_iter):
            s_dict = env.transform(env.state)
            s = state_mode(mode, s_dict, n_state)
            q = np.dot(s, w) + b
            # epsilon-greedy action selection
            rand = np.random.binomial(1, eps, 1)[0]
            if rand == 0:
                a = np.argmax(q)
            else:
                a = np.random.randint(n_action, size=1)[0]

            s1_dict, reward, terminate = env.step(a)
            s1 = state_mode(mode, s1_dict, n_state)
            q1 = np.dot(s1, w) + b
            # semi-gradient Q-learning update for the chosen action and the bias
            td_error = q[a] - (reward + gamma * np.max(q1))
            w[:, a] -= lr * td_error * s
            b -= lr * td_error
            reward_cum += reward
            if terminate:
                break

        rewards_sum[i, 0] = reward_cum

    pars = np.insert(w.reshape((n_state * n_action, 1)), 0, b, axis=0)
    np.savetxt(w_out, pars, fmt="%f")
    np.savetxt(r_out, rewards_sum, fmt="%f")
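

# Hedged sketch of the state_mode helper q_learning() assumes (not necessarily
# the author's original): it expands the sparse state dictionary returned by
# the environment into a dense feature vector of length n_state.
def state_mode(mode, s_dict, n_state):
    # mode is kept only to match the call signature; env.transform / env.step
    # already return the mode-appropriate dictionary.
    s = np.zeros(n_state)
    for k, v in s_dict.items():
        s[k] = v
    return s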
Example #3
0
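# Assumed import for this snippet; MountainCar is provided elsewhere.
import numpy as np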
def main(args):
    mode = args[1]
    weight_out = args[2]
    returns_out = args[3]
    num_episodes = int(args[4])
    max_iterations = int(args[5])
    epsilon = float(args[6])
    discount_factor = float(args[7])
    learning_rate = float(args[8])

    #greedy action selection: return all Q-values, the best action, and its Q-value
    #(the bias b is read from the enclosing scope)
    def greedy(state, weight, action_space):
        Q_list = []
        for each in range(action_space):
            Q = 0
            for k, v in state.items():
                Q += v * weight[k, each]
            Q += b
            Q_list.append(Q)
        a = np.argmax(Q_list)
        max_Q = max(Q_list)
        return Q_list, a, max_Q

    #define function to calculate q after selecting action
    def q_calc(state, weight, a, b):
        q = 0
        for k, v in state.items():
            q += v * weight[k, a]
        q += b
        return q

    #define function to update the weights of the chosen action a
    def update(state, a, weight, learning_rate, q, reward,
               discount_factor, max_Q):
        for k, v in state.items():
            weight[k, a] = weight[k, a] - learning_rate * (
                q - (reward + (discount_factor * max_Q))) * v
        return weight

    env = MountainCar(mode)  #call the environment
    weight = np.zeros((env.state_space, env.action_space))  #initialize weights
    b = 0  #initialize bias
    returns_out = open(returns_out, 'w')
    for e in range(0, num_episodes):  #iterating over the number of episodes
        env.reset()  #reset
        reward = 0  #initialize reward
        for it in range(0, max_iterations):  #iterate up to max_iterations steps
            state = env.transform(env.state)  #current state as a feature dictionary
            action_space = env.action_space  #number of available actions
            probability = np.random.uniform(0.0, 1.0)
            if probability < epsilon:
                a = np.random.randint(0, action_space)  #explore: random action
            else:
                _, a, _ = greedy(state, weight, action_space)  #exploit: greedy action
            s_next, reward_next, done = env.step(a)  #take the action; done == True ends the episode
            reward = reward + reward_next  #accumulate the episode return
            q = q_calc(state, weight, a, b)  #q for the chosen action a
            _, _, max_Q = greedy(s_next, weight, action_space)  #max_Q over the next state
            weight = update(state, a, weight, learning_rate, q, reward_next,
                            discount_factor, max_Q)  #update weights for action a
            b = b - learning_rate * (q - (reward_next + (discount_factor * max_Q)))  #update bias
            if done:
                break  #break when done = TRUE
        returns_out.write(str(reward) + "\n")  #record the return for each episode
    returns_out.close()

    output_list = []
    output_list.append(b)
    for w in weight:
        for each in w:
            output_list.append(each)
    with open(weight_out, 'w') as f:
        for item in output_list:
            f.write("%s\n" % item)  #write the final bias and weights
Example #4
0
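# Assumed imports for this snippet; MountainCar is provided elsewhere.
import random
import numpy as np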
def main(args):
    mode = args[1]
    weight_out = args[2]
#     print(mode, weight_out)
    returns_out = args[3]
    episodes = int(args[4])
    max_iterations = int(args[5])
    epsilon = float(args[6])
    gamma = float(args[7])
    learning_rate = float(args[8])
    car = MountainCar(mode)
    returns_out = open(returns_out,"w") 
    weight_out = open(weight_out,"w") 

    return_out_raw = ''
    weight_out_raw = ''


    s = car.state_space  # 2048 for 'tile' mode, 2 for 'raw' mode
    bias = 0
    w = np.zeros((s, 3))



    def calc_q(state, action):
        # q(s, a) = bias + sum_i state[i] * w[i][action] over the sparse state dict
        qsaw = bias
        for i in state:
            qsaw += state[i] * w[i][action]
        return qsaw

    for i in range(episodes):
        reward = 0
        car.reset()

        a0 = car.transform(car.state)
        e = random.random()
        if e <= epsilon:
            c = np.random.randint(0,3)
        else:
            c = np.argmax(np.array([calc_q(a0,j) for j in range(3)]))


        a = car.step(c)
        d = np.array([calc_q(a[0],j) for j in range(3)])
        qsa = calc_q(a0,c)
        kk = np.zeros(s)  # dense feature vector for the current state
        for k in a0:
            kk[k] = a0[k]
        w[:, c] = w[:, c] - learning_rate * (qsa - (a[1] + gamma * np.max(d))) * kk
        bias = bias - learning_rate*(qsa-(a[1]+gamma*np.max(d)))

        reward += a[1]

        # a = (state_dict, reward, done); each step gives reward -1, so abs(reward) counts steps
        while not a[2] and abs(reward) < max_iterations:
            e = random.random()
            if e <= epsilon:
                c = np.random.randint(0,3)
            else:
                c = np.argmax(np.array([calc_q(a[0],j) for j in range(3)]))
#             print(c)
            a0 = a
            a = car.step(c)
            d = np.array([calc_q(a[0],j) for j in range(3)])
            qsa = calc_q(a0[0], c)
            kk = np.zeros(s)  # dense feature vector for the previous state
            for k in a0[0]:
                kk[k] = a0[0][k]
            w[:, c] = w[:, c] - learning_rate * (qsa - (a[1] + gamma * np.max(d))) * kk
            bias = bias - learning_rate*(qsa-(a[1]+gamma*np.max(d)))


            reward += a[1]
        return_out_raw += str(reward) + '\n'


    weight_out_raw += str(bias) + '\n'
    for i in w:
        for j in i:
            weight_out_raw += str(j) + '\n'



    returns_out.write(return_out_raw)
    weight_out.write(weight_out_raw)
    returns_out.close()
    weight_out.close()
Example #5
0
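# Minimal smoke test of the MountainCar environment API (no learning).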
def main(args):
    mode = args[1]
    env = MountainCar(mode)
    env.reset()
    print(env.transform(env.state))
    print(env.reset())