def loader(name):
    """Run the saved source PILCO policy on the target environment, rendering each
    episode and logging its length."""
    env = gym.make('continuous-cartpole-v99')
    env.seed(73)
    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller, reward=R, sparse=False)

    score_logger = ScoreLogger('PI ADJUST ANALYSIS')
    # observation_space = env.observation_space.shape[0]
    run = 0
    while True:
        run += 1
        state = env.reset()
        step = 0
        while True:
            step += 1
            env.render()
            # TODO RUN PI ADJUST
            action = utils.policy(env, pilco, state, False)
            # TODO RUN PI ADJUST: COMMENT THE NEXT LINE
            state_next, reward, terminal, info = env.step(action)
            # reward = reward if not terminal else -reward
            state = state_next
            if terminal:
                print("Run: " + str(run) + ", score: " + str(step))
                score_logger.add_score(step, run)
                break
    env.env.close()
def plot_pilco_source_learning_curve():
    """Run each saved PILCO model once on the source environment, plot cart position
    against pole angle per model, and plot episode length per model."""
    env = gym.make('continuous-cartpole-v0')
    env.seed(73)
    pilcos = ['initial'] + [str(i) for i in range(6)]
    rewards = []
    for i, p in enumerate(pilcos):
        controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                                   num_basis_functions=bf, max_action=max_action)
        R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
        pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(p),
                           controller=controller, reward=R, sparse=False)
        score_logger = ScoreLogger('Score for Model {:d}'.format(i))
        state = env.reset()
        step = 0
        xs = []
        angles = []
        while True:
            xs.append(state[0])      # cart position
            angles.append(state[2])  # pole angle
            step += 1
            env.render()
            u_action = utils.policy(env, pilco, state, False)
            state_copy = state
            a = np.ndarray.tolist(state_copy)
            a.extend(np.ndarray.tolist(u_action))  # state-action pair (unused in this function)
            state_next, reward, terminal, info = env.step(u_action)
            reward = reward if not terminal else -reward
            state = state_next
            if terminal:
                print('Run: {:d}, score: {:d}'.format(i, step))
                score_logger.add_score(step, i)
                break
        rewards.append(step)
        plt.plot(xs, angles)
        plt.savefig('pilco-{:d}_states_plot'.format(i), bbox_inches="tight")
        plt.close()
    env.close()
    plt.plot([i for i, _ in enumerate(pilcos)], rewards)
    plt.savefig('pilco_rewards_plot', bbox_inches="tight")
    plt.close()
    return rewards, xs, angles
def load_and_run_model(env, name):
    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller, reward=R, sparse=False)
    print('Running {:s}'.format(name))
    rollout(env, pilco, timesteps=T_sim, verbose=False, SUBS=SUBS)
def piadjust(NT, name):
    """Iteratively learn an adjustment policy pi_adj that corrects the source PILCO
    policy for the target dynamics (NT rounds of target sampling and refitting)."""
    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller, reward=R, sparse=False)

    env_S = gym.make('continuous-cartpole-v0')
    env_S.seed(73)
    env_T = gym.make('continuous-cartpole-v99')
    env_T.seed(73)

    D_S = sampler(pilco, env_S, samples_n=30, trials=50)
    # print('D_S sampling done')
    D_T = None
    pi_adj = pilco
    for i in range(NT):
        print('{:d}/{:d}'.format(i + 1, NT))
        D_adj = []
        # Sample target-domain transitions: with the raw source policy on the first
        # round, with the current adjusted policy afterwards.
        if i == 0:
            D_i_T = sampler(pilco, env_T, samples_n=30)
        else:
            D_i_T = sampler_adj(pi_adj, pilco, env_T, 30)
        D_T = D_i_T if D_T is None else np.concatenate((D_i_T, D_T))

        # print('Going for inverse dyn')
        gpr = inverse_dyn(D_T)  # inverse dynamics model of the target domain
        # print('inverse dyn done')
        for samp in D_S:
            x_s = list(samp[0])
            x_s1 = list(samp[2])
            u_t_S = samp[1]
            a = np.array(x_s + x_s1).reshape(1, 8)
            u_t_T = gpr.predict(a, return_std=False)
            D_adj.append((x_s, u_t_S, u_t_T - u_t_S))
        # print('Going for L3')
        pi_adj = L3(D_adj)
        # print('L3 Done')
        # i = i + 1
        # if (i % 1 == 0):
        save_object(pi_adj, 'transfer-save/pilco-{:s}-transfer-{:d}.pkl'.format(name, i))

    env_S.env.close()
    env_T.env.close()
    return pi_adj
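# `piadjust` above relies on `sampler`, `sampler_adj`, `inverse_dyn`, and `L3`, which are
# defined elsewhere in this project. The two functions below are a minimal sketch of what
# `inverse_dyn` and `L3` could look like, assuming scikit-learn's GaussianProcessRegressor;
# that assumption is consistent with the `gpr.predict(a, return_std=False)` and
# `pi_adjust.predict(np.array(a).reshape(1, -1))` calls made on their return values in this
# file. They are illustrations only, not the repository's actual implementations.

def inverse_dyn_sketch(D_T):
    """Sketch: fit an inverse dynamics model u_t ~ f(x_t, x_{t+1}) on target transitions."""
    from sklearn.gaussian_process import GaussianProcessRegressor
    # D_T is assumed to hold (state, action, next_state) tuples, as consumed above.
    X = np.array([list(x) + list(x1) for (x, u, x1) in D_T])  # (x_t, x_{t+1}) pairs
    y = np.array([np.ravel(u) for (x, u, x1) in D_T])         # applied action
    return GaussianProcessRegressor(normalize_y=True).fit(X, y)


def L3_sketch(D_adj):
    """Sketch: fit the adjustment policy (x_s, u_S) -> (u_T - u_S)."""
    from sklearn.gaussian_process import GaussianProcessRegressor
    X = np.array([list(x_s) + list(np.ravel(u_s)) for (x_s, u_s, du) in D_adj])
    y = np.array([np.ravel(du) for (x_s, u_s, du) in D_adj])
    return GaussianProcessRegressor(normalize_y=True).fit(X, y)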
def see_progression(pilco_name='saved/pilco-continuous-cartpole-5',
                    transfer_name='{:d}true_dyn_pi_adj.pkl', adjust=True):
    """Replay the target environment with each saved adjustment policy in turn
    (or with no adjustment when adjust=False) and return the episode lengths."""
    env = gym.make('continuous-cartpole-v99')
    env.seed(1)
    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco(pilco_name, controller=controller, reward=R, sparse=False)

    rewards = []
    for i in range(10):
        print('Running {:s}'.format(transfer_name.format(i)))
        if adjust:
            with open(transfer_name.format(i), 'rb') as inp2:
                pi_adjust = pickle.load(inp2)
        score_logger = ScoreLogger('Score for Model {:d}'.format(i))
        state = env.reset()
        step = 0
        while True:
            step += 1
            env.render()
            u_action = utils.policy(env, pilco, state, False)
            state_copy = state
            a = np.ndarray.tolist(state_copy)
            a.extend(np.ndarray.tolist(u_action))
            if adjust:
                pi_adjust_action = pi_adjust.predict(np.array(a).reshape(1, -1))[0]
            else:
                pi_adjust_action = 0  # ENABLE THIS TO SEE IT RUN WITHOUT THE ADJUSTMENT
            state_next, reward, terminal, info = env.step(u_action + pi_adjust_action)
            reward = reward if not terminal else -reward
            state = state_next
            if terminal:
                print('Run: {:d}, score: {:d}'.format(i, step))
                score_logger.add_score(step, i)
                break
        rewards.append(step)
    env.close()
    return rewards
def source_loader(name):
    """Evaluate the unadjusted source PILCO policy on the target environment over
    101 episodes and return the average episode length."""
    Rs = np.empty(10).reshape(1, 10)  # unused here; kept for parity with loader()
    env = gym.make('continuous-cartpole-v99')
    env.seed(73)
    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller, reward=R, sparse=False)
    env = gym.make('continuous-cartpole-v99')  # note: replaces the seeded env created above
    pi_adjust = None
    score_logger = ScoreLogger('PI ADJUST ANALYSIS')

    run = 0
    avg_reward = 0
    while run != 101:
        run += 1
        if run % 20 == 0:
            print('run: ', run)
        state = env.reset()
        # print(state)
        # input()
        step = 0
        while True:
            step += 1
            # env.render()
            # TODO RUN PI ADJUST
            u_action = utils.policy(env, pilco, state, False)
            state_copy = state
            # TODO RUN PI ADJUST: COMMENT THE NEXT LINE
            state_next, reward, terminal, info = env.step(u_action)
            reward = reward if not terminal else -reward
            state = state_next
            if terminal:
                # print("Run: " + ", score: " + str(step))
                score_logger.add_score(step, run)
                avg_reward = avg_reward + step
                break
    avg_reward = avg_reward / run
    env.env.close()
    return avg_reward
def true_loader(name):
    """Run the source PILCO policy plus the adjustment learned under the true target
    dynamics ('9true_dyn_pi_adj.pkl'), rendering and logging each episode."""
    env = gym.make('continuous-cartpole-v99')
    env.seed(73)
    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller, reward=R, sparse=False)
    with open('9true_dyn_pi_adj.pkl', 'rb') as inp2:
        pi_adjust = pickle.load(inp2)
    # with open('10_pi_adj.pkl', 'rb') as inp2:
    #     good_pi = pickle.load(inp2)
    score_logger = ScoreLogger('PI ADJUST ANALYSIS')

    run = 0
    while True:
        run += 1
        state = env.reset()
        # print(state)
        # input()
        step = 0
        while True:
            step += 1
            env.render()
            u_action = utils.policy(env, pilco, state, False)
            state_copy = state
            a = np.ndarray.tolist(state_copy)
            a.extend(np.ndarray.tolist(u_action))
            action = pi_adjust.predict(np.array(a).reshape(1, -1))[0]
            state_next, reward, terminal, info = env.step(action + u_action)
            reward = reward if not terminal else -reward
            state = state_next
            if terminal:
                print("Run: " + str(run) + ", score: " + str(step))
                score_logger.add_score(step, run)
                break
    env.env.close()
if __name__ == '__main__':
    bf = 10
    max_action = 1.0
    target = np.array([0., 0., 0., 0.])
    weights = np.diag([0.5, 0.1, 0.5, 0.25])
    T = 400
    state_dim = 4
    control_dim = 1

    with tf.Session() as sess:
        env = gym.make('continuous-cartpole-v0')
        controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                                   num_basis_functions=bf, max_action=max_action)
        R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
        pilco = load_pilco('saved/pilco-continuous-cartpole-5',
                           controller=controller, reward=R, sparse=False)
        pilco.act = lambda x: pilco.compute_action(x[None, :])[0, :]
        env_s = gym.make('continuous-cartpole-v0')
        env_t = gym.make('continuous-cartpole-v1')
        # NOTE (assumption): the original called padjust(env_s, env_t, pilco), which is not
        # defined in this file and does not match piadjust(NT, name) above; piadjust builds
        # its own source/target envs, so NT and name are inferred here (model '5' is loaded
        # above, and the progression utilities expect 10 adjustment iterations).
        piadjust(NT=10, name='5')
def loader(name):
    """Evaluate each saved adjustment policy ('<k>_pi_adj.pkl' for k = 1..10) on the
    target environment over 101 episodes each and return the average episode lengths.

    NOTE: this redefinition shadows the loader() defined earlier in this file."""
    Rs = np.empty(10).reshape(1, 10)
    env = gym.make('continuous-cartpole-v99')
    env.seed(73)
    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller, reward=R, sparse=False)

    for pick in range(1, 11):
        env = gym.make('continuous-cartpole-v99')
        with open(str(pick) + '_pi_adj.pkl', 'rb') as inp2:
            pi_adjust = pickle.load(inp2)
        score_logger = ScoreLogger('PI ADJUST ANALYSIS')
        run = 0
        avg_reward = 0
        while run != 101:
            run += 1
            if run % 20 == 0:
                print('run: ', run)
            state = env.reset()
            # print(state)
            # input()
            step = 0
            while True:
                step += 1
                # env.render()
                # TODO RUN PI ADJUST
                u_action = utils.policy(env, pilco, state, False)
                state_copy = state
                a = np.ndarray.tolist(state_copy)
                a.extend(np.ndarray.tolist(u_action))
                action = pi_adjust.predict(np.array(a).reshape(1, -1))
                action = action[0]
                # clamp the adjusted action to the valid control range [-1, 1]
                if action[0] > 1:
                    action[0] = 1
                elif action[0] < -1:
                    action[0] = -1
                # TODO RUN PI ADJUST: COMMENT THE NEXT LINE
                state_next, reward, terminal, info = env.step(action)
                reward = reward if not terminal else -reward
                state = state_next
                if terminal:
                    # print("Run: " + ", score: " + str(step))
                    score_logger.add_score(step, run)
                    avg_reward = avg_reward + step
                    break
        avg_reward = avg_reward / run
        env.env.close()
        Rs[0][pick - 1] = avg_reward
    return Rs