for idn in range(len(n)): # define simulation length Nsteps = 500 # define starting state x0 = np.array([-8, 9]) # load model predictor nb_states = 2 nb_actions = 1 wb_model = WB_Model(dt) # create the SUSD learner Q = np.eye(2) R = np.array([1]) K0 = np.array([0.1, 0.1]) N_rollout = n[idn] T_rollout = 20 S = SUSD(wb_model, Q, R, K0, N_rollout, T_rollout, alpha=0.35, term_len=100) # estimate optimal K using SUSD conveged, iters = S.search(x0, max_iter=100) K = S.K.reshape((nb_actions, nb_states)) print "SUSD Search took", iters, "iterations" print "Estimated K:"; print K z_list[idn] = S.z_buf cost_plot(z_list, n) plt.show()
# NOTE(review): this chunk continues an LQR cost-accumulation loop whose
# `for t ...` / `if nb_states == 1:` head lies outside this view; the
# leading `else:` belongs to that conditional.
    else:
        # vector-state case: x' Q x summed over the state dimension
        Ct += np.sum(x*np.matmul(Q, x), axis=0)
    if nb_actions == 1:
        # scalar input: u R u
        Ct += (u*R*u).flatten()
    else:
        # vector input: u' R u
        Ct += np.sum(u*np.matmul(R, u), axis=0)
    C_lqr += Ct
    # simulate forward in time
    x = WB_model.predict(x, u).flatten()

# estimate optimal K using PG (Natural Policy Gradient)
print "Search using Natural Policy Gradient..."
converged, PG_iters = PG.search(x0, r=1)
K_PG = PG.K.reshape((nb_actions, nb_states))

# estimate optimal K using SUSD
# NOTE(review): `conveged` is a misspelling of `converged` (unused here)
print "Search using SUSD..."
conveged, S_iters = S.search(x0, r=1)
K_S = S.K.reshape((nb_actions, nb_states))

# report the LQR baseline gains and both learned gain estimates
print "LQR Gains:"; print -K_lqr
print "Natural Policy Gradient Search took", PG_iters, "iterations"
print "Natural Policy Gradient Gains:"; print K_PG
print "SUSD Search took", S_iters, "iterations"
print "SUSD Gains:"; print K_S

# plot the cost results
cost_plot(PG.z_buf, S.z_buf, N_trajectories, N_agents,
          int(T_rollout/dt), C_lqr)
plt.show()
# NOTE(review): the first line is the tail of a learner-constructor call
# (e.g. `S = SUSD(WB_model, ...`) whose opening lies outside this view.
         Q, R, K0, N_agents, T_rollout, dt=dt, alpha=0.1, term_len=200)

# compute the optimal K via LQR on the linearized system
# (4 states, 2 inputs, judging by the C/D matrix shapes below)
A, B = WB_model.system()
sys = control.StateSpace(A, B, np.eye(4), np.zeros((4, 2)), 1)
K_lqr, _, _ = control.lqr(sys, Q, R)

# estimate optimal K using SUSD
# NOTE(review): `conveged` is a misspelling of `converged` (unused here)
conveged, iters = S.search(x0, r=0.001, max_iter=100000)
K = S.K.reshape((nb_actions, nb_states))
print "SUSD Search took", iters, "iterations"
#print "Estimated K:", K
#print "LQR K:", -K_lqr
# control.lqr returns K for u = -Kx, hence the sign flip when comparing
print "Error K:", K - K_lqr

# simulate the trajectory under the learned gain (and LQR, buffers below)
x = np.zeros([nb_states, Nsteps])
x_lqr = np.zeros([nb_states, Nsteps])
x[:, 0] = x0.flatten()
x_lqr[:, 0] = x0.flatten()
for t in range(Nsteps - 1):
    u = np.dot(K, x[:, t])
    # NOTE(review): call is cut off here -- continuation (second argument
    # and close paren) lies outside this view.
    x[:, t + 1] = WB_model.predict(x[:, t].reshape(-1, 1),
# Compare Random Search, Natural Policy Gradient, and SUSD gain estimates
# against the LQR baseline: one short search round (max_iter=250), an LQR
# cost rollout, then a longer search round (max_iter=400) with reports.
#print "Finite Difference Gains:", K

# estimate optimal K using RS
converged, iters = RS.search(x0.copy(), max_iter=250)
K = RS.K.reshape((nb_actions, nb_states))
print "Random Search took", iters, "iterations"
print "Random Search Gains:", K

# estimate optimal K using PG
converged, iters = PG.search(x0.copy(), r=0.5, epsilon=0.1, max_iter=250)
K = PG.K.reshape((nb_actions, nb_states))
print "Natural Policy Gradient Search took", iters, "iterations"
print "Natural Policy Gradient Gains:", K

# estimate optimal K using SUSD
# NOTE(review): `conveged` is a misspelling of `converged` (unused here)
conveged, iters = S.search(x0.copy(), r=0.5, max_iter=250)
K = S.K.reshape((nb_actions, nb_states))
print "SUSD Search took", iters, "iterations"
print "SUSD Gains:", K

# roll the LQR controller forward and accumulate its quadratic cost
# (control.lqr's K assumes u = -Kx, hence the negated gain)
C_lqr = 0
x = x0.copy()
Nsteps = int(T_rollout / dt)
for t in range(Nsteps - 1):
    # compute input
    u = np.dot(-K_lqr, x)
    # compute cost: x'Qx + u'Ru, with scalar shortcuts
    Ct = 0
    if nb_states == 1:
        Ct += (x * Q * x).flatten()
    else:
        Ct += np.sum(x * np.matmul(Q, x), axis=0)
    if nb_actions == 1:
        Ct += (u * R * u).flatten()
    else:
        Ct += np.sum(u * np.matmul(R, u), axis=0)
    C_lqr += Ct
    # simulate forward in time
    x = WB_model.predict(x, u).flatten()

# estimate optimal K using RS (longer budget for the final comparison)
print "Search using RS..."
converged, RS_iters = RS.search(x0.copy(), max_iter=400)
K_RS = RS.K.reshape((nb_actions, nb_states))

# estimate optimal K using SUSD
print "Search using SUSD..."
conveged, S_iters = S.search(x0, r=0.1, max_iter=400)
K_S = S.K.reshape((nb_actions, nb_states))

# estimate optimal K using PG
print "Search using Natural Policy Gradient..."
converged, PG_iters = PG.search(x0, r=1, epsilon=0.1, max_iter=400)
K_PG = PG.K.reshape((nb_actions, nb_states))

# report all gains against the LQR baseline
# NOTE(review): the report is cut off here (SUSD results presumably
# printed in a continuation outside this view)
print "LQR Gains:"
print -K_lqr
print "Random Search took", RS_iters, "iterations"
print "Random Search Gains:"
print K_RS
print "Natural Policy Gradient Search took", PG_iters, "iterations"
print "Natural Policy Gradient Gains:"
print K_PG
# create the SUSD learner Q = np.eye(2) R = np.array([1]) K0 = np.array([-10., -10.]) N_rollout = 3 T_rollout = 30 S = SUSD(WB_model, Q, R, K0, N_rollout, T_rollout) # compute the optimal K A, B = WB_model.system() K_lqr, _, _ = control.lqr(A, B, Q, R) # estiamte optimal K using SUSD conveged, iters = S.search(x0) K = S.K.reshape((nb_actions, nb_states)) print "SUSD Search took", iters, "iterations" print "Estimated K:", K print "LQR K:", -K_lqr # simulate the trajectory x = np.zeros([nb_states, Nsteps]) x_lqr = np.zeros([nb_states, Nsteps]) x[:, 0] = x0.flatten() x_lqr[:, 0] = x0.flatten() for t in range(Nsteps - 1): u = np.dot(K, x[:, t]) x[:, t + 1] = WB_model.predict(x[:, t].reshape(-1, 1), u.reshape(-1, 1)).flatten()