import numpy as np
import matplotlib.pyplot as plt

# Assumed module name for the assignment-provided environment.
from environment import MountainCar


def main(args):
    mode = args[1]
    weight_out = args[2]
    returns_out = args[3]
    episodes = int(args[4])
    max_iterations = int(args[5])
    epsilon = float(args[6])
    gamma = float(args[7])
    learning_rate = float(args[8])

    # Exponential moving average of the returns, with bias correction.
    beta = 0.96
    tmp = 0.0
    vt = np.array([], dtype="float64")
    returns_list = np.array([], dtype="float64")

    env = MountainCar(mode)
    S_size = env.state_space
    A_size = env.action_space
    W = np.zeros([S_size, A_size], dtype="float64")
    b = 0
    parameters = {"W": W, "b": b}

    with open(returns_out, "w") as fout:
        for i in range(episodes):
            env.reset()
            state = env.transform(env.state)
            returns = 0.0
            done = False
            for j in range(max_iterations):
                Q = Q_calculation(state, parameters)
                a = find_action(epsilon, Q, A_size)
                grads, reward, state, done = grads_calculation(
                    parameters, state, a, env, Q, gamma)
                parameters = update(grads, parameters, learning_rate)
                returns += reward
                if done:
                    break
            returns_list = np.append(returns_list, returns)
            fout.write(str(returns) + "\n")
            # Bias-corrected rolling mean: dividing by (1 - beta^(i+1))
            # keeps the early estimates from being pulled toward zero.
            tmp = beta * tmp + (1 - beta) * returns
            tmp1 = tmp / (1 - beta ** (i + 1))
            vt = np.append(vt, tmp1)

    x = range(1, episodes + 1)
    plt.plot(x, returns_list)
    plt.plot(x, vt)
    plt.legend(('Returns', 'Rolling Mean'), loc='upper left')
    plt.title(mode + " mode: returns and rolling mean")
    plt.ylabel("returns & rolling mean")
    plt.xlabel("episodes")
    plt.show()

    write_weights(parameters, weight_out)
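# The helpers called above (Q_calculation, find_action, grads_calculation,
# update, write_weights) are not included in this snippet. The sketches below
# are reconstructions inferred from the call sites, not the original
# implementations; the TD-error form mirrors the updates in the other
# solutions in this file.
def Q_calculation(state, parameters):
    # Q(s, a; W, b) = b + sum_k s_k * W[k, a], with s given as a sparse dict.
    W, b = parameters["W"], parameters["b"]
    Q = np.full(W.shape[1], b, dtype="float64")
    for k, v in state.items():
        Q += v * W[k]
    return Q


def find_action(epsilon, Q, A_size):
    # Epsilon-greedy: explore with probability epsilon, otherwise act greedily.
    if np.random.uniform() < epsilon:
        return np.random.randint(A_size)
    return int(np.argmax(Q))


def grads_calculation(parameters, state, a, env, Q, gamma):
    # Take one step, form the TD error q(s,a) - (r + gamma * max_a' q(s',a')),
    # and return gradients for W and b along with the new state.
    next_state, reward, done = env.step(a)
    Q_next = Q_calculation(next_state, parameters)
    td_error = Q[a] - (reward + gamma * np.max(Q_next))
    dW = np.zeros_like(parameters["W"])
    for k, v in state.items():
        dW[k, a] = td_error * v
    return {"W": dW, "b": td_error}, reward, next_state, done


def update(grads, parameters, learning_rate):
    # Gradient step on the TD error (semi-gradient Q-learning).
    parameters["W"] -= learning_rate * grads["W"]
    parameters["b"] -= learning_rate * grads["b"]
    return parameters


def write_weights(parameters, weight_out):
    # Bias first, then the weights in row-major order (matching the output
    # format of the other solutions in this file).
    with open(weight_out, "w") as f:
        f.write(str(parameters["b"]) + "\n")
        for row in parameters["W"]:
            for w in row:
                f.write(str(w) + "\n")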
import numpy as np

from environment import MountainCar  # assumed module name


def q_learning(mode, w_out, r_out, epis, max_iter, eps, gamma, lr):
    epis = int(epis)
    max_iter = int(max_iter)
    eps = float(eps)
    gamma = float(gamma)
    lr = float(lr)

    env = MountainCar(mode)
    n_state = env.state_space
    n_action = env.action_space
    w = np.zeros((n_state, n_action), dtype=np.longdouble)
    b = 0
    rewards_sum = np.zeros((epis, 1), dtype=np.longdouble)

    for i in range(epis):
        reward_cum = 0
        for j in range(max_iter):
            s_dict = env.transform(env.state)
            s = state_mode(mode, s_dict, n_state)
            q = np.dot(s, w) + b
            # Epsilon-greedy action selection.
            rand = np.random.binomial(1, eps, 1)[0]
            if rand == 0:
                a = np.argmax(q)
            else:
                a = np.random.randint(n_action, size=1)[0]
            s1_dict, reward, terminate = env.step(a)
            s1 = state_mode(mode, s1_dict, n_state)
            q1 = np.dot(s1, w) + b
            # Semi-gradient Q-learning update on the TD error
            # q(s, a) - (r + gamma * max_a' q(s', a')).
            td_error = q[a] - (reward + gamma * np.max(q1))
            w[:, a] -= lr * td_error * s
            b -= lr * td_error
            reward_cum += reward
            if terminate:
                break
        env.reset()  # start the next episode from a fresh state
        rewards_sum[i, 0] = reward_cum

    # Write the bias followed by the flattened weights, then the per-episode
    # returns.
    pars = np.insert(w.reshape((n_state * n_action, 1)), 0, b, axis=0)
    np.savetxt(w_out, pars, fmt="%f")
    np.savetxt(r_out, rewards_sum, fmt="%f")
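# state_mode is not defined in this snippet. A minimal sketch, assuming it
# densifies the sparse {index: value} dict returned by env.transform into a
# length-n_state feature vector (the mode argument is kept only to match the
# call site):
def state_mode(mode, s_dict, n_state):
    s = np.zeros(n_state, dtype=np.longdouble)
    for idx, val in s_dict.items():
        s[idx] = val
    return s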
import sys

import numpy as np

from environment import MountainCar  # assumed module name


def main(args):
    mode = args[1]
    weight_out = args[2]
    returns_out = args[3]
    num_episodes = int(args[4])
    max_iterations = int(args[5])
    epsilon = float(args[6])
    discount_factor = float(args[7])
    learning_rate = float(args[8])

    # Greedy action selection: compute Q(s, a) for every action and pick the
    # argmax (b is passed explicitly rather than captured from the enclosing
    # scope).
    def greedy(state, weight, b, action_space):
        Q_list = []
        for each in range(action_space):
            Q = b
            for k, v in state.items():
                Q += v * weight[k, each]
            Q_list.append(Q)
        return np.argmax(Q_list), max(Q_list)

    # Q-value for the chosen action.
    def q_calc(state, weight, a, b):
        q = b
        for k, v in state.items():
            q += v * weight[k, a]
        return q

    # Update the weights of the chosen action by the TD error scaled by the
    # (sparse) state features.
    def update(state, a, weight, learning_rate, q, reward, discount_factor,
               max_Q):
        for k, v in state.items():
            weight[k, a] -= learning_rate * (
                q - (reward + discount_factor * max_Q)) * v
        return weight

    env = MountainCar(mode)  # create the environment
    weight = np.zeros((env.state_space, env.action_space))  # initialize weights
    b = 0  # initialize bias

    with open(returns_out, 'w') as returns_file:
        for e in range(num_episodes):  # iterate over episodes
            env.reset()
            total_reward = 0
            for it in range(max_iterations):
                state = env.transform(env.state)  # state as a sparse dict
                action_space = env.action_space
                # Epsilon-greedy action selection.
                if np.random.uniform(0.0, 1.0) < epsilon:
                    a = np.random.randint(0, action_space)
                else:
                    a, _ = greedy(state, weight, b, action_space)
                # Take the chosen action; done == True ends the episode.
                s_next, reward_next, done = env.step(a)
                total_reward += reward_next
                q = q_calc(state, weight, a, b)  # Q for the chosen action
                _, max_Q = greedy(s_next, weight, b, action_space)
                weight = update(state, a, weight, learning_rate, q,
                                reward_next, discount_factor, max_Q)
                b -= learning_rate * (
                    q - (reward_next + discount_factor * max_Q))
                if done:
                    break
            returns_file.write(str(total_reward) + "\n")

    # Write the bias followed by the weights, one value per line.
    with open(weight_out, 'w') as f:
        f.write("%s\n" % b)
        for w in weight:
            for each in w:
                f.write("%s\n" % each)
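# A hypothetical entry point for the script above, assuming the argument
# order used consistently across these solutions:
#   python q_learning.py <mode> <weight_out> <returns_out> <episodes> \
#       <max_iterations> <epsilon> <gamma> <learning_rate>
if __name__ == "__main__":
    main(sys.argv)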
import random

import numpy as np

from environment import MountainCar  # assumed module name


def main(args):
    mode = args[1]
    weight_out = args[2]
    returns_out = args[3]
    episodes = int(args[4])
    max_iterations = int(args[5])
    epsilon = float(args[6])
    gamma = float(args[7])
    learning_rate = float(args[8])

    car = MountainCar(mode)
    s = car.state_space  # 2048 in "tile" mode, 2 in "raw" mode
    bias = 0
    w = np.zeros((s, 3))

    returns_out = open(returns_out, "w")
    weight_out = open(weight_out, "w")
    return_out_raw = ''
    weight_out_raw = ''

    # Q(state, action) for a sparse {index: value} state dict.
    def calc_q(state, action):
        qsaw = bias
        for i in state:
            qsaw += state[i] * w[i][action]
        return qsaw

    for i in range(episodes):
        reward = 0
        car.reset()
        a0 = car.transform(car.state)
        # Epsilon-greedy choice for the first step.
        if random.random() <= epsilon:
            c = np.random.randint(0, 3)
        else:
            c = np.argmax(np.array([calc_q(a0, j) for j in range(3)]))
        a = car.step(c)  # a = (state_dict, reward, done)
        d = np.array([calc_q(a[0], j) for j in range(3)])
        qsa = calc_q(a0, c)
        # Densify the sparse state for the vectorized weight update.
        kk = np.zeros(s)
        for k in a0:
            kk[k] = a0[k]
        w[:, c] -= learning_rate * (qsa - (a[1] + gamma * np.max(d))) * kk
        bias -= learning_rate * (qsa - (a[1] + gamma * np.max(d)))
        reward += a[1]

        # Each step yields reward -1, so abs(reward) doubles as a step count.
        while not a[2] and abs(reward) < max_iterations:
            if random.random() <= epsilon:
                c = np.random.randint(0, 3)
            else:
                c = np.argmax(np.array([calc_q(a[0], j) for j in range(3)]))
            a0 = a
            a = car.step(c)
            d = np.array([calc_q(a[0], j) for j in range(3)])
            qsa = calc_q(a0[0], c)
            kk = np.zeros(s)
            for k in a0[0]:
                kk[k] = a0[0][k]
            w[:, c] -= learning_rate * (qsa - (a[1] + gamma * np.max(d))) * kk
            bias -= learning_rate * (qsa - (a[1] + gamma * np.max(d)))
            reward += a[1]

        return_out_raw += str(reward) + '\n'

    # Output format: bias first, then the weights in row-major order.
    weight_out_raw += str(bias) + '\n'
    for i in w:
        for j in i:
            weight_out_raw += str(j) + '\n'

    returns_out.writelines(return_out_raw)
    weight_out.writelines(weight_out_raw)
    returns_out.close()
    weight_out.close()
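# Every solution above assumes the same assignment-provided MountainCar
# environment. The stub below is inferred from the call sites only and is a
# documentation sketch, not the real class: states are exposed as sparse
# {feature_index: value} dicts (2 features in "raw" mode, 2048 in "tile"
# mode), there are 3 actions, and step() returns (state_dict, reward, done)
# with a reward of -1 per step (which is what makes the abs(reward)
# step-counting trick above work).
class MountainCarSketch:
    state_space: int   # number of state features
    action_space: int  # number of actions

    def reset(self):
        """Reset the car to a start state and update self.state."""

    def transform(self, state):
        """Map an internal state to a sparse {index: value} feature dict."""

    def step(self, action):
        """Advance one step; return (state_dict, reward, done)."""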
from environment import MountainCar  # assumed module name


def main(args):
    mode = args[1]
    env = MountainCar(mode)
    env.reset()
    print(env.transform(env.state))
    print(env.reset())
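# Example invocation of the smoke test above (hypothetical file name):
#   python check_env.py raw
# In "raw" mode the transformed state appears to be {0: position,
# 1: velocity}; in "tile" mode, a dict mapping active tile indices to 1.0.
# This is an inference from how the solutions above consume the dict, not
# something verified against the environment code.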