import numpy as np
import gym

from trajopt.gps import MBGPS


def create_job(kwargs):
    import warnings
    warnings.filterwarnings("ignore")

    # pendulum env
    env = gym.make('Pendulum-TO-v0')
    env._max_episode_steps = 10000
    env.unwrapped.dt = 0.02
    env.unwrapped.umax = np.array([2.5])
    env.unwrapped.periodic = False

    dm_state = env.observation_space.shape[0]
    dm_act = env.action_space.shape[0]

    state = env.reset()
    init_state = tuple([state, 1e-4 * np.eye(dm_state)])

    # open-loop trajectory optimization over the full horizon
    solver = MBGPS(env, init_state=init_state,
                   init_action_sigma=25., nb_steps=300,
                   kl_bound=.1, action_penalty=1e-3,
                   activation={'shift': 250, 'mult': 0.5})
    solver.run(nb_iter=100, verbose=False)

    # sample one stochastic rollout under the optimized controller
    solver.ctl.sigma = np.dstack([1e-1 * np.eye(dm_act)] * 300)
    data = solver.rollout(nb_episodes=1, stoch=True, init=state)

    obs = np.squeeze(data['x'], axis=-1).T
    act = np.squeeze(data['u'], axis=-1).T
    return obs, act
def create_job(kwargs):
    import warnings
    warnings.filterwarnings("ignore")

    # cartpole env
    env = gym.make('Cartpole-TO-v0')
    env._max_episode_steps = 10000
    env.unwrapped._dt = 0.01

    dm_state = env.observation_space.shape[0]
    dm_act = env.action_space.shape[0]

    horizon, nb_steps = 50, 500

    state = np.zeros((dm_state, nb_steps + 1))
    action = np.zeros((dm_act, nb_steps))

    state[:, 0] = env.reset()

    # MPC loop: re-plan over a short horizon from the current state
    for t in range(nb_steps):
        solver = MBGPS(env,
                       init_state=tuple([state[:, t], 1e-16 * np.eye(dm_state)]),
                       init_action_sigma=5., nb_steps=horizon, kl_bound=0.1)
        trace = solver.run(nb_iter=10, verbose=False)

        # apply the first nominal action of the re-planned sequence
        _nominal_action = solver.udist.mu
        action[:, t] = _nominal_action[:, 0]
        state[:, t + 1], _, _, _ = env.step(action[:, t])

        print('Time Step:', t, 'Cost:', trace[-1])

    return state[:, :-1].T, action.T
def create_job(kwargs):
    import warnings
    warnings.filterwarnings("ignore")

    # pendulum env
    env = gym.make('Pendulum-TO-v1')
    env._max_episode_steps = 10000
    env.unwrapped._dt = 0.05

    dm_state = env.observation_space.shape[0]
    dm_act = env.action_space.shape[0]

    horizon, nb_steps = 15, 150

    state = np.zeros((dm_state, nb_steps + 1))
    action = np.zeros((dm_act, nb_steps))

    state[:, 0] = env.reset()

    for t in range(nb_steps):
        solver = MBGPS(env,
                       init_state=tuple([state[:, t], 1e-4 * np.eye(dm_state)]),
                       init_action_sigma=1., nb_steps=horizon, kl_bound=2.)
        trace = solver.run(nb_iter=10, verbose=False)

        _act = solver.ctl.sample(state[:, t], 0, stoch=False)
        action[:, t] = np.clip(_act, -env.ulim, env.ulim)
        state[:, t + 1], _, _, _ = env.step(action[:, t])

        print('Time Step:', t, 'Cost:', trace[-1])

    return state[:, :-1].T, action.T
def create_job(kwargs):
    import warnings
    warnings.filterwarnings("ignore")

    # pendulum env
    env = gym.make('Pendulum-TO-v0')
    env._max_episode_steps = 10000
    env.unwrapped.dt = 0.02
    env.unwrapped.umax = np.array([2.5])
    env.unwrapped.periodic = True

    dm_state = env.observation_space.shape[0]
    dm_act = env.action_space.shape[0]

    horizon, nb_steps = 15, 100

    state = np.zeros((dm_state, nb_steps + 1))
    action = np.zeros((dm_act, nb_steps))

    state[:, 0] = env.reset()

    # MPC loop with stochastic action sampling
    for t in range(nb_steps):
        init_state = tuple([state[:, t], 1e-4 * np.eye(dm_state)])
        solver = MBGPS(env, init_state=init_state,
                       init_action_sigma=2.5, nb_steps=horizon,
                       kl_bound=1., action_penalty=1e-3)
        trace = solver.run(nb_iter=5, verbose=False)

        # sample a noisy first action from the re-planned controller
        solver.ctl.sigma = np.dstack([1e-2 * np.eye(dm_act)] * horizon)
        u = solver.ctl.sample(state[:, t], 0, stoch=True)
        action[:, t] = np.clip(u, -env.ulim, env.ulim)
        state[:, t + 1], _, _, _ = env.step(action[:, t])

        # print('Time Step:', t, 'Cost:', trace[-1])

    return state[:, :-1].T, action.T
def create_job(kwargs):
    import warnings
    warnings.filterwarnings("ignore")

    # cartpole env
    env = gym.make('Cartpole-TO-v0')
    env._max_episode_steps = 10000
    env.unwrapped.dt = 0.05
    env.unwrapped.umax = np.array([5.])
    env.unwrapped.periodic = True

    dm_state = env.observation_space.shape[0]
    dm_act = env.action_space.shape[0]

    horizon, nb_steps = 20, 100

    env_sigma = env.unwrapped.sigma

    state = np.zeros((dm_state, nb_steps + 1))
    action = np.zeros((dm_act, nb_steps))

    state[:, 0] = env.reset()

    for t in range(nb_steps):
        solver = MBGPS(env,
                       init_state=tuple([state[:, t], env_sigma]),
                       init_action_sigma=1., nb_steps=horizon,
                       kl_bound=2., action_penalty=1e-5)
        trace = solver.run(nb_iter=10, verbose=False)

        _act = solver.ctl.sample(state[:, t], 0, stoch=False)
        action[:, t] = np.clip(_act, -env.ulim, env.ulim)
        state[:, t + 1], _, _, _ = env.step(action[:, t])

        print('Time Step:', t, 'Cost:', trace[-1])

    return state[:, :-1].T, action.T
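# The create_job variants above take a kwargs argument they never use, which
# suggests they are meant to be fanned out over worker processes to collect
# trajectories in parallel. A minimal, hypothetical driver along those lines
# is sketched below; the process pool, job count, and stacking are
# illustrative choices, not the repository's actual runner.
import numpy as np
from multiprocessing import Pool

if __name__ == "__main__":
    nb_jobs = 4  # illustrative choice

    # each worker runs one episode and returns (obs, act) trajectories
    with Pool(processes=nb_jobs) as pool:
        results = pool.map(create_job, [{}] * nb_jobs)

    # stack the per-job trajectories along the time axis
    obs, act = zip(*results)
    obs, act = np.vstack(obs), np.vstack(act)
    print('collected observations:', obs.shape, 'actions:', act.shape)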
import gym
from trajopt.gps import MBGPS

import warnings
warnings.filterwarnings("ignore")

# qube env
env = gym.make('QQube-TO-v1')
env._max_episode_steps = 500

alg = MBGPS(env, nb_steps=500,
            kl_bound=0.01, init_ctl_sigma=100.,
            activation={'shift': 450, 'mult': 0.5})

# run gps
trace = alg.run(nb_iter=10, verbose=True)

# plot dists
alg.plot()

# plot objective
import matplotlib.pyplot as plt
plt.figure()
plt.plot(trace)
plt.show()

# sample and plot one trajectory
data = alg.sample(nb_episodes=1, stoch=True)

plt.figure()
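# The snippet above cuts off after plt.figure(). A hypothetical continuation
# is sketched below; the (dm_state, nb_steps, nb_episodes) layout of
# data['x'] is an assumption carried over from the rollout examples above.
for k in range(data['x'].shape[0]):
    plt.plot(data['x'][k, :, 0])
plt.show()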
import gym
from trajopt.gps import MBGPS

import matplotlib.pyplot as plt

# pendulum env
env = gym.make('Pendulum-TO-v0')
env._max_episode_steps = 100
env.unwrapped.dt = 0.05

env.seed(1337)

solver = MBGPS(env, nb_steps=100,
               init_state=env.init(),
               init_action_sigma=5.0,
               kl_bound=1e1,
               slew_rate=False,
               action_penalty=1,
               activation={'mult': 1., 'shift': 80})

trace = solver.run(nb_iter=25, verbose=True)

# plot dists
solver.plot()

# plot objective
plt.figure()
plt.plot(trace)
plt.show()
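# As a follow-up, the optimized controller can be evaluated by averaging
# rollout costs, mirroring the rollout call and cost bookkeeping used in the
# LQR/Riccati comparison below; the episode count here is arbitrary.
import numpy as np

eval_data = solver.rollout(nb_episodes=25, stoch=False)
print('Mean rollout cost:', np.mean(np.sum(eval_data['c'], axis=0)))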
horizon, nb_steps = 100, 500

env_sigma = env.unwrapped._sigma

state = np.zeros((dm_state, nb_steps + 1))
action = np.zeros((dm_act, nb_steps))

state[:, 0] = env.reset()

for t in range(nb_steps):
    solver = MBGPS(env,
                   init_state=tuple([state[:, t], env_sigma]),
                   init_action_sigma=1., nb_steps=horizon, kl_bound=2.)
    trace = solver.run(nb_iter=10, verbose=False)

    _act = solver.ctl.sample(state[:, t], 0, stoch=False)
    action[:, t] = np.clip(_act, -env.ulim, env.ulim)
    state[:, t + 1], _, _, _ = env.step(action[:, t])

    print('Time Step:', t, 'Cost:', trace[-1])

import matplotlib.pyplot as plt

plt.figure()

plt.subplot(7, 1, 1)
plt.plot(state[0, :], '-b')

plt.subplot(7, 1, 2)
plt.plot(state[1, :], '-b')
import gym

from trajopt.rgps import LRGPS
from trajopt.gps import MBGPS

import warnings
warnings.filterwarnings("ignore")

# lqr task
env = gym.make('LQR-TO-v1')
env.env._max_episode_steps = 100

# mass-damper
rgps = LRGPS(env, nb_steps=100,
             policy_kl_bound=0.25,
             param_kl_bound=25e2,
             init_state=env.init(),
             init_action_sigma=100.)
rgps.run(nb_iter=50, verbose=True)

gps = MBGPS(env, nb_steps=100,
            init_state=env.init(),
            init_action_sigma=100.,
            kl_bound=0.25)
gps.run(nb_iter=100, verbose=True)

rgps.compare(std_ctl=gps.ctl)
# @Date: 2019-06-16-18-38
# @Author: Hany Abdulsamad
# @Contact: [email protected]

import gym
from trajopt.gps import MBGPS

# double cartpole env
env = gym.make('DoubleCartpole-TO-v0')
env._max_episode_steps = 200

alg = MBGPS(env, nb_steps=200,
            kl_bound=10.,
            init_ctl_sigma=5.0,
            activation=range(150, 200))

# run gps
trace = alg.run()

# plot dists
alg.plot()

# plot objective
import matplotlib.pyplot as plt
plt.figure()
plt.plot(trace)
plt.show()
import numpy as np
import gym

from trajopt.gps import MBGPS
# assuming the LQR baseline lives here, matching the package layout above
from trajopt.riccati import Riccati

import warnings
warnings.filterwarnings("ignore")

# lqr task
env = gym.make('LQR-TO-v0')
env.env._max_episode_steps = 100

dm_state = env.unwrapped.dm_state
dm_act = env.unwrapped.dm_act

mbgps = MBGPS(env, nb_steps=100,
              init_state=env.init(),
              init_action_sigma=100.,
              kl_bound=5.)
mbgps.run(nb_iter=15, verbose=True)

riccati = Riccati(env, nb_steps=100,
                  init_state=env.init())
riccati.run()

np.random.seed(1337)
env.seed(1337)
gps_data = mbgps.rollout(250, stoch=False)

np.random.seed(1337)
env.seed(1337)
riccati_data = riccati.rollout(250)

print('GPS Cost: ', np.mean(np.sum(gps_data['c'], axis=0)),
      ', Riccati Cost', np.mean(np.sum(riccati_data['c'], axis=0)))
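# A quick visual complement to the printed means, assuming as above that
# gps_data['c'] and riccati_data['c'] hold per-step costs with episodes along
# the last axis; the histogram itself is illustrative, not part of the
# original script.
import matplotlib.pyplot as plt

# per-episode total costs, summed over the horizon as in the print above
gps_costs = np.sum(gps_data['c'], axis=0)
riccati_costs = np.sum(riccati_data['c'], axis=0)

plt.figure()
plt.hist(gps_costs, bins=25, alpha=0.5, label='GPS')
plt.hist(riccati_costs, bins=25, alpha=0.5, label='Riccati')
plt.legend()
plt.xlabel('episode cost')
plt.show()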
# @Date: 2019-06-16-18-38
# @Author: Hany Abdulsamad
# @Contact: [email protected]

import gym
from trajopt.gps import MBGPS

# cartpole env
env = gym.make('Cartpole-TO-v1')
env._max_episode_steps = 501

alg = MBGPS(env, nb_steps=200,
            kl_bound=1.,
            init_ctl_sigma=5.0,
            activation=range(150, 200))

# run gps
trace = alg.run(nb_iter=50)

# plot dists
alg.plot()

# plot objective
import matplotlib.pyplot as plt
plt.figure()
plt.plot(trace)
plt.show()