from functools import partial

from kusanagi.shell import double_cartpole


def setup_double_cartpole_experiment(params=None):
    # get experiment parameters (double_cartpole defaults, not the
    # single-cartpole ones this function originally pulled in by mistake)
    if params is None:
        params = double_cartpole.default_params()

    # init environment
    env = double_cartpole.DoubleCartpole(**params['plant'])

    # init cost model
    cost = partial(double_cartpole.double_cartpole_loss, **params['cost'])

    return env, cost, params
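
# Usage sketch (an illustration, not part of the original script); assumes
# the gym-style reset/step interface these environments expose in the
# scripts further below.
if __name__ == '__main__':
    import numpy as np

    env, cost, params = setup_double_cartpole_experiment()
    state = env.reset()
    zero_action = np.zeros(env.action_space.shape)
    next_state, step_cost, done, info = env.step(zero_action)
    print(next_state, step_cost, done)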
from kusanagi.ghost import regression
from kusanagi.shell import cartpole


def experiment1_params(n_rnd=1, n_opt=100,
                       dynmodel_class=regression.SSGP_UI, **kwargs):
    '''PILCO with RBF controller.'''
    params = cartpole.default_params()
    params['n_rnd'] = int(n_rnd)
    params['n_opt'] = int(n_opt)
    params['plant']['maxU'] = params['policy']['maxU']
    # any extra keyword argument matching a top-level parameter overrides
    # the default; values are expected as strings and eval'd into objects
    for key in kwargs:
        if key in params:
            params[key] = eval(kwargs[key])
    params['dynmodel_class'] = dynmodel_class

    loss_kwargs = {}
    polopt_kwargs = {}
    extra_inps = []
    return params, loss_kwargs, polopt_kwargs, extra_inps
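
# Usage sketch (illustration only): keyword overrides matching a top-level
# parameter are eval'd above, so they must be passed as strings that
# evaluate to the intended object; ast.literal_eval would be a safer
# drop-in when the strings come from untrusted input.
#
#   params, loss_kwargs, polopt_kwargs, extra_inps = experiment1_params(
#       n_rnd=2, n_opt=50, dynmodel_class=regression.GP_UI)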
import os

import numpy as np

from kusanagi import utils
from kusanagi.ghost import regression as kreg
from kusanagi.shell.cartpole import default_params  # , CartpoleDraw
from kusanagi.shell.plant import SerialPlant
from kusanagi.ghost.algorithms.PILCO import PILCO, MC_PILCO
from kusanagi.ghost.control import NNPolicy
from kusanagi.utils import plot_results

# np.random.seed(31337)
np.set_printoptions(linewidth=500)

if __name__ == '__main__':
    # setup output directory
    utils.set_output_dir(os.path.join(utils.get_output_dir(),
                                      'cartpole_serial'))

    J = 4    # number of random initial trials
    N = 100  # learning iterations
    learner_params = default_params()

    # initialize learner
    learner_params['dynmodel_class'] = kreg.SSGP_UI
    learner_params['params']['dynmodel']['n_inducing'] = 100
    learner_params['plant_class'] = SerialPlant
    learner_params['params']['plant']['maxU'] = np.array(
        learner_params['params']['policy']['maxU']) * 1.0 / 0.4
    learner_params['params']['plant']['state_indices'] = [0, 2, 3, 1]
    learner_params['params']['plant']['baud_rate'] = 4000000
    learner_params['params']['plant']['port'] = '/dev/ttyACM0'
    # learner_params['min_method'] = 'ADAM'
    # learner_params['dynmodel_class'] = NN
    # learner_params['params']['dynmodel']['hidden_dims'] = [100, 100, 100]

    learner = PILCO(**learner_params)
    try:
        learner.load(load_compiled_fns=False)
    except Exception:
        # nothing saved yet; start from a fresh learner
        pass
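    # The original script is truncated at this point. As a hedged sketch,
    # a kusanagi episodic-learning script typically continues with the J
    # random exploration trials followed by N train/apply iterations,
    # roughly as below (method names follow the kusanagi EpisodicLearner
    # interface and are an assumption here, since the original loop is
    # not shown):
    #
    #   for i in range(N):
    #       learner.train_dynamics()
    #       learner.train_policy()
    #       learner.apply_controller()
    #       learner.save()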
import argparse
import sys
from functools import partial

import numpy as np

from kusanagi.shell import cartpole, double_cartpole, pendulum

# NOTE: MultiOutputRegressionWrapper, unpack and update_hyperstate are
# project-local helpers assumed to be in scope here; hedged sketches of
# unpack and get_data2 appear below.


def main_loop():
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", type=str,
                        choices=['cartpole', 'double_cartpole', 'pendulum'],
                        default='cartpole')
    parser.add_argument("--discount_factor", type=float, default=.995)
    parser.add_argument("--gather_data_epochs", type=int, default=3,
                        help='Epochs for initial data gathering.')
    parser.add_argument("--train_hp_iterations", type=int, default=2000*10)
    parser.add_argument("--train_policy_batch_size", type=int, default=30)
    parser.add_argument("--no_samples", type=int, default=1)
    parser.add_argument("--basis_dim", type=int, default=256)
    parser.add_argument("--hidden_dim", type=int, default=32)
    parser.add_argument("--rffm_seed", type=int, default=1)
    parser.add_argument("--Agent", type=str, choices=['', '2'], default='')
    parser.add_argument("--max_train_hp_datapoints", type=int, default=20000)
    parser.add_argument("--update_hyperstate", type=int, default=1)
    parser.add_argument("--policy_use_hyperstate", type=int, default=1)
    parser.add_argument("--cma_maxiter", type=int, default=1000)
    parser.add_argument("--learn_diff", type=int, choices=[0, 1], default=0)
    parser.add_argument("--dump_model", type=int, choices=[0, 1], default=0)
    args = parser.parse_args()

    print(sys.argv)
    print(args)

    # NOTE: only Agent2 is imported; --Agent '' (the default) selects a
    # bare `Agent` class that must be provided elsewhere in the project
    from blr_regression2_sans_hyperstate_kusanagi_multioutput import Agent2

    if args.env == 'cartpole':
        params = cartpole.default_params()
        cost = partial(cartpole.cartpole_loss, **params['cost'])
        env = cartpole.Cartpole(loss_func=cost, **params['plant'])
        max_steps = 25
        maxA = 10.
    elif args.env == 'double_cartpole':
        params = double_cartpole.default_params()
        cost = partial(double_cartpole.double_cartpole_loss,
                       **params['cost'])
        env = double_cartpole.DoubleCartpole(loss_func=cost,
                                             **params['plant'])
        max_steps = 30
        maxA = 20.
    elif args.env == 'pendulum':
        params = pendulum.default_params()
        cost = partial(pendulum.pendulum_loss, **params['cost'])
        env = pendulum.Pendulum(loss_func=cost, **params['plant'])
        max_steps = 40
        maxA = 2.5
    else:
        raise ValueError('Unknown environment.')

    regression_wrapper_state = MultiOutputRegressionWrapper(
        input_dim=env.observation_space.shape[0] + env.action_space.shape[0],
        output_dim=env.observation_space.shape[0],
        basis_dim=args.basis_dim,
        length_scale=1.,
        signal_sd=1.,
        noise_sd=5e-4,
        prior_sd=1.,
        rffm_seed=args.rffm_seed,
        train_hp_iterations=args.train_hp_iterations)

    agent = eval('Agent' + args.Agent)(
        env=env,
        x_dim=env.observation_space.shape[0] + env.action_space.shape[0],
        y_dim=env.observation_space.shape[0],
        state_dim=env.observation_space.shape[0],
        action_dim=env.action_space.shape[0],
        observation_space_low=env.observation_space.low,
        observation_space_high=env.observation_space.high,
        action_space_low=np.array([-maxA]),
        action_space_high=np.array([maxA]),
        unroll_steps=max_steps,
        no_samples=args.no_samples,
        discount_factor=args.discount_factor,
        random_matrix_state=regression_wrapper_state.random_matrix,
        bias_state=regression_wrapper_state.bias,
        basis_dim_state=regression_wrapper_state.basis_dim,
        hidden_dim=args.hidden_dim,
        update_hyperstate=args.update_hyperstate,
        policy_use_hyperstate=args.policy_use_hyperstate,
        learn_diff=args.learn_diff,
        dump_model=args.dump_model)

    flag = False
    from utils import get_data3
    data_buffer = get_data3(env, trials=args.gather_data_epochs,
                            max_steps=max_steps, maxA=maxA)
    init_states = np.stack([env.reset()
                            for _ in range(args.train_policy_batch_size)],
                           axis=0)

    for epoch in range(1000):
        # Train hyperparameters and update the dynamics model.
        states_actions, states, rewards, next_states = unpack(data_buffer)
        next_states_train = (next_states.copy() - states.copy()
                             if args.learn_diff else next_states.copy())
        if not flag:
            regression_wrapper_state._train_hyperparameters(
                states_actions, next_states_train)
            regression_wrapper_state._reset_statistics(
                states_actions, next_states_train)
        else:
            regression_wrapper_state._update(states_actions,
                                             next_states_train)
        if len(data_buffer) >= args.max_train_hp_datapoints:
            flag = True
        if flag:
            data_buffer = []
        tmp_data_buffer = []

        # Fit the policy network.
        agent._fit(args.cma_maxiter,
                   init_states.copy(),
                   regression_wrapper_state.XX.copy(),
                   regression_wrapper_state.Xy.copy(),
                   regression_wrapper_state.hyperparameters.copy())

        # Get the hyperstate (posterior statistics) and hyperparameters.
        hyperstate_params = [
            regression_wrapper_state.Llower.copy()[None, ...],
            regression_wrapper_state.Xy.copy()[None, ...]]

        # Evaluate the fitted policy for one episode, updating the
        # hyperstate online and collecting data for the next epoch.
        total_rewards = 0.
        state = env.reset()
        steps = 0
        while True:
            # env.render()
            action = agent._forward(agent.thetas, state[np.newaxis, ...],
                                    hyperstate_params)[0]
            next_state, cost, done, _ = env.step(action)
            reward = -cost
            steps += 1
            hyperstate_params = update_hyperstate(
                agent, hyperstate_params,
                regression_wrapper_state.hyperparameters.copy(),
                [state, action, reward, next_state, done],
                args.learn_diff)
            tmp_data_buffer.append([state, action, reward, next_state, done])
            total_rewards += float(reward)
            state = next_state.copy()
            if done or steps >= max_steps:
                print('epoch:', epoch, 'total_rewards:', total_rewards)
                data_buffer.extend(tmp_data_buffer)
                break
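
# A minimal sketch of the project-local `unpack` helper called in
# main_loop, assuming the replay buffer holds [state, action, reward,
# next_state, done] records; the project's real implementation may differ.
def unpack(data_buffer):
    # stack the buffer columns into arrays
    states, actions, rewards, next_states, _ = [
        np.array(x) for x in zip(*data_buffer)]
    # the regression wrapper consumes concatenated state-action inputs
    states_actions = np.concatenate([states, actions], axis=-1)
    return states_actions, states, rewards, next_states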
import matplotlib.pyplot as plt


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", type=str,
                        choices=['cartpole', 'double_cartpole', 'pendulum'],
                        default='cartpole')
    parser.add_argument("--train_hp_iterations", type=int, default=2000)
    parser.add_argument("--basis_dim", type=int, default=256)
    parser.add_argument("--basis_dim_reward", type=int, default=600)
    parser.add_argument("--matern_param", type=float, default=np.inf)
    parser.add_argument("--matern_param_reward", type=float, default=np.inf)
    parser.add_argument("--update_hyperstate", type=int, default=0)
    parser.add_argument("--trials", type=int, default=1)
    args = parser.parse_args()
    print(args)

    if args.env == 'cartpole':
        params = cartpole.default_params()
        cost = partial(cartpole.cartpole_loss, **params['cost'])
        env = cartpole.Cartpole(loss_func=cost, **params['plant'])
        max_steps = 25
        maxA = 10.
    elif args.env == 'double_cartpole':
        params = double_cartpole.default_params()
        cost = partial(double_cartpole.double_cartpole_loss,
                       **params['cost'])
        env = double_cartpole.DoubleCartpole(loss_func=cost,
                                             **params['plant'])
        max_steps = 30
        maxA = 20.
    elif args.env == 'pendulum':
        params = pendulum.default_params()
        cost = partial(pendulum.pendulum_loss, **params['cost'])
        env = pendulum.Pendulum(loss_func=cost, **params['plant'])
        max_steps = 40
        maxA = 2.5
    else:
        raise ValueError('Unknown environment.')

    # gather training data and fit the model hyperparameters once
    states, actions, rewards, next_states = get_data2(
        env, trials=args.trials, max_steps=max_steps, maxA=maxA)
    states_actions = np.concatenate([states, actions], axis=-1)

    predictor = MultiOutputRegressionWrapper(
        input_dim=env.observation_space.shape[0] + env.action_space.shape[0],
        output_dim=env.observation_space.shape[0],
        basis_dim=args.basis_dim,
        length_scale=1.,
        signal_sd=1.,
        noise_sd=5e-4,
        prior_sd=1.,
        rffm_seed=1,
        train_hp_iterations=args.train_hp_iterations,
        matern_param=args.matern_param)
    predictor._train_hyperparameters(states_actions, next_states)

    while True:
        predictor._reset_statistics(states_actions, next_states)

        # gather a fresh test trajectory
        states2, actions2, rewards2, next_states2 = get_data2(
            env, trials=1, max_steps=max_steps, maxA=maxA)
        states_actions2 = np.concatenate([states2, actions2], axis=-1)

        # row 1: one-step predictions against the observed next states
        plt.figure()
        predict_mu, predict_sigma = predictor._predict(states_actions2)
        for i in range(env.observation_space.shape[0]):
            plt.subplot(3, env.observation_space.shape[0], i + 1)
            plt.plot(np.arange(len(next_states2[:, i:i+1])),
                     next_states2[:, i:i+1])
            plt.errorbar(np.arange(len(predict_mu[:, i:i+1])),
                         predict_mu[:, i:i+1],
                         yerr=np.sqrt(predict_sigma),
                         color='m', ecolor='g')
            plt.grid()

        # multi-step rollouts: replay the recorded actions through the
        # model, sampling no_lines particles from the predictive
        # distribution at every step
        traj_reward = []
        traj = []
        no_lines = 50
        state = np.tile(np.copy(states2[0:1, ...]), [no_lines, 1])
        for a in actions2:
            action = np.tile(a[np.newaxis, ...], [no_lines, 1])
            state_action = np.concatenate([state, action], axis=-1)
            predict_mu, predict_sigma = predictor._predict(state_action)
            state = predict_mu + np.sqrt(predict_sigma) * \
                np.random.normal(size=predict_mu.shape)
            state = np.clip(state, env.observation_space.low,
                            env.observation_space.high)
            traj.append(np.copy(state))
            reward = -env.loss_func(state)
            traj_reward.append(reward)
        traj_reward = np.stack(traj_reward, axis=-1)
        traj = np.stack(traj, axis=-1)

        # row 3: sampled reward rollouts (red) vs. observed rewards
        plt.subplot(3, 1, 3)
        for j in range(no_lines):
            y = traj_reward[j, :]
            plt.plot(np.arange(len(y)), y, color='r')
        plt.plot(np.arange(len(rewards2)), rewards2)
        plt.grid()

        # row 2: sampled state rollouts (red) vs. observed states
        for i in range(env.observation_space.shape[0]):
            plt.subplot(3, env.observation_space.shape[0],
                        env.observation_space.shape[0] + i + 1)
            for j in range(no_lines):
                y = traj[j, i, :]
                plt.plot(np.arange(len(y)), y, color='r')
            plt.plot(np.arange(len(next_states2[..., i])),
                     next_states2[..., i])
            plt.grid()
        plt.show(block=True)
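
# A hedged sketch of the project-local `get_data2` helper used in main(),
# assuming it rolls out uniformly random actions in [-maxA, maxA] and
# returns stacked (states, actions, rewards, next_states) arrays; the
# real helper (like get_data3 above) may differ.
def get_data2(env, trials, max_steps, maxA):
    states, actions, rewards, next_states = [], [], [], []
    for _ in range(trials):
        state = env.reset()
        for _ in range(max_steps):
            action = np.random.uniform(low=-maxA, high=maxA,
                                       size=env.action_space.shape)
            next_state, cost, done, _ = env.step(action)
            states.append(state)
            actions.append(action)
            rewards.append(-cost)  # env.step returns a cost
            next_states.append(next_state)
            state = next_state
            if done:
                break
    return (np.array(states), np.array(actions),
            np.array(rewards), np.array(next_states))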
# assumption: the left-hand side of this statement is lost in the excerpt;
# `source_dir =` is taken from the commented-out alternative below
source_dir = os.path.join(base_dir,
                          '/home/juancamilog/.kusanagi/output/cartpole/')
# source_dir = os.path.join(base_dir,
#                           'examples/learned_policies/cartpole_serial')
target_dir = os.path.join(base_dir,
                          'examples/learned_policies/target_sim2robot')

# SOURCE DOMAIN
utils.set_output_dir(source_dir)

# load source experience
source_experience = ExperienceDataset(
    filename='PILCO_SSGP_UI_6_4_Cartpole_RBFPolicy_sat_dataset')
# source_experience = ExperienceDataset(
#     filename='PILCO_SSGP_UI_SerialPlant_RBFPolicy_sat_dataset')

# load source policy
source_policy = RBFPolicy(filename='RBFPolicy_sat_5_1_cpu_float64')

# TARGET DOMAIN
utils.set_output_dir(target_dir)
target_params = default_params()
target_params['params']['H'] = 5.0  # control horizon
target_params['params']['max_evals'] = 125

# dynamics and inverse-dynamics models
target_params['dynmodel_class'] = kreg.BNN     # SSGP_UI
target_params['invdynmodel_class'] = kreg.BNN  # GP_UI
target_params['params']['invdynmodel'] = {}
target_params['params']['invdynmodel']['max_evals'] = 1000

# policy
target_params['policy_class'] = AdjustedPolicy
target_params['params']['policy']['adjustment_model_class'] = kreg.BNN
# target_params['params']['policy']['adjustment_model_class'] = \
#     control.RBFPolicy
# target_params['params']['policy']['n_inducing'] = 20
# disable the saturating function; we probably need bigger controls for
# heavier pendulums
target_params['params']['policy']['sat_func'] = None
target_params['params']['policy']['max_evals'] = 5000
# assumption: the original expression is truncated after
# `source_policy.D +`; D + E (policy input dims plus control dims) is
# used to complete it
target_params['params']['policy']['m0'] = np.zeros(source_policy.D +
                                                   source_policy.E)