Example #1
from functools import partial

from kusanagi.shell import double_cartpole


def setup_double_cartpole_experiment(params=None):
    # get experiment parameters (default double cartpole settings)
    if params is None:
        params = double_cartpole.default_params()

    # init environment
    env = double_cartpole.DoubleCartpole(**params['plant'])

    # init cost model
    cost = partial(double_cartpole.double_cartpole_loss, **params['cost'])

    return env, cost, params
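A minimal usage sketch of the helper above (not part of the original example); the reset() call and the structure of the returned params dict follow the other examples on this page.

# hypothetical usage sketch: construct the experiment and inspect the pieces
env, cost, params = setup_double_cartpole_experiment()
state = env.reset()
print(type(env).__name__, sorted(params.keys()))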
Example #2
from kusanagi.shell import cartpole
from kusanagi.ghost import regression


def experiment1_params(n_rnd=1, n_opt=100, dynmodel_class=regression.SSGP_UI,
                       **kwargs):
    '''PILCO with an RBF controller.'''
    params = cartpole.default_params()
    params['n_rnd'] = int(n_rnd)
    params['n_opt'] = int(n_opt)
    params['plant']['maxU'] = params['policy']['maxU']
    # extra keyword arguments are passed in as strings and evaluated only
    # when they override an existing entry in the default parameter dict
    for key in kwargs:
        if key in params:
            params[key] = eval(kwargs[key])
    params['dynmodel_class'] = dynmodel_class

    loss_kwargs = {}
    polopt_kwargs = {}
    extra_inps = []

    return params, loss_kwargs, polopt_kwargs, extra_inps
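A minimal, hypothetical call of experiment1_params; note that extra keyword overrides must be strings, since they are passed through eval when the key already exists in the default parameter dict.

# hypothetical usage sketch: two random trials, fifty optimization iterations
params, loss_kwargs, polopt_kwargs, extra_inps = experiment1_params(n_rnd=2, n_opt=50)
print(params['n_rnd'], params['n_opt'], params['dynmodel_class'])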
Example #3
import os

import numpy as np

from kusanagi import utils
from kusanagi.shell.cartpole import default_params  #, CartpoleDraw
from kusanagi.shell.plant import SerialPlant
from kusanagi.ghost import regression as kreg
from kusanagi.ghost.algorithms.PILCO import PILCO, MC_PILCO
from kusanagi.ghost.control import NNPolicy
from kusanagi.utils import plot_results

#np.random.seed(31337)
np.set_printoptions(linewidth=500)

if __name__ == '__main__':
    # setup output directory
    utils.set_output_dir(os.path.join(utils.get_output_dir(),'cartpole_serial'))

    J = 4                                                                   # number of random initial trials
    N = 100                                                                 # learning iterations
    learner_params = default_params()
    
    # initialize learner
    learner_params['dynmodel_class'] = kreg.SSGP_UI
    learner_params['params']['dynmodel']['n_inducing'] = 100
    learner_params['plant_class'] = SerialPlant
    learner_params['params']['plant']['maxU'] = np.array(learner_params['params']['policy']['maxU'])*1.0/0.4
    learner_params['params']['plant']['state_indices'] = [0,2,3,1]
    learner_params['params']['plant']['baud_rate'] = 4000000
    learner_params['params']['plant']['port'] = '/dev/ttyACM0'
    #learner_params['min_method'] = 'ADAM'
    #learner_params['dynmodel_class'] = NN
    #learner_params['params']['dynmodel']['hidden_dims'] = [100,100,100]
    learner = PILCO(**learner_params)
    try:
        # reload a previously saved model and dataset if available
        learner.load(load_compiled_fns=False)
    except Exception:
        pass
Example #4
def main_loop():
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", type=str, choices=['cartpole', 'double_cartpole', 'pendulum'], default='cartpole')
    parser.add_argument("--discount_factor", type=float, default=.995)
    parser.add_argument("--gather_data_epochs", type=int, default=3, help='Epochs for initial data gather.')
    parser.add_argument("--train_hp_iterations", type=int, default=2000*10)
    parser.add_argument("--train_policy_batch_size", type=int, default=30)
    parser.add_argument("--no_samples", type=int, default=1)
    parser.add_argument("--basis_dim", type=int, default=256)
    parser.add_argument("--hidden_dim", type=int, default=32)
    parser.add_argument("--rffm_seed", type=int, default=1)
    parser.add_argument("--Agent", type=str, choices=['', '2'], default='')
    parser.add_argument("--max_train_hp_datapoints", type=int, default=20000)
    parser.add_argument("--update_hyperstate", type=int, default=1)
    parser.add_argument("--policy_use_hyperstate", type=int, default=1)
    parser.add_argument("--cma_maxiter", type=int, default=1000)
    parser.add_argument("--learn_diff", type=int, choices=[0, 1], default=0)
    parser.add_argument("--dump_model", type=int, choices=[0, 1], default=0)
    args = parser.parse_args()

    print(sys.argv)
    print(args)
    from blr_regression2_sans_hyperstate_kusanagi_multioutput import Agent2

    if args.env == 'cartpole':
        params = cartpole.default_params()
        cost = partial(cartpole.cartpole_loss, **params['cost'])
        env = cartpole.Cartpole(loss_func=cost, **params['plant'])
        max_steps = 25
        maxA = 10.
    elif args.env == 'double_cartpole':
        params = double_cartpole.default_params()
        cost = partial(double_cartpole.double_cartpole_loss, **params['cost'])
        env = double_cartpole.DoubleCartpole(loss_func=cost, **params['plant'])
        max_steps = 30
        maxA = 20.
    elif args.env == 'pendulum':
        params = pendulum.default_params()
        cost = partial(pendulum.pendulum_loss, **params['cost'])
        env = pendulum.Pendulum(loss_func=cost, **params['plant'])
        max_steps = 40
        maxA = 2.5
    else:
        raise Exception('Unknown environment.')


    regression_wrapper_state = MultiOutputRegressionWrapper(input_dim=env.observation_space.shape[0]+env.action_space.shape[0],
                                                            output_dim=env.observation_space.shape[0],
                                                            basis_dim=args.basis_dim,
                                                            length_scale=1.,
                                                            signal_sd=1.,
                                                            noise_sd=5e-4,
                                                            prior_sd=1.,
                                                            rffm_seed=args.rffm_seed,
                                                            train_hp_iterations=args.train_hp_iterations)
    agent = eval('Agent'+args.Agent)(env=env,
                                     x_dim=env.observation_space.shape[0]+env.action_space.shape[0],
                                     y_dim=env.observation_space.shape[0],
                                     state_dim=env.observation_space.shape[0],
                                     action_dim=env.action_space.shape[0],
                                     observation_space_low=env.observation_space.low,
                                     observation_space_high=env.observation_space.high,
                                     action_space_low=np.array([-maxA]),
                                     action_space_high=np.array([maxA]),
                                     unroll_steps=max_steps,
                                     no_samples=args.no_samples,
                                     discount_factor=args.discount_factor,
                                     random_matrix_state=regression_wrapper_state.random_matrix,
                                     bias_state=regression_wrapper_state.bias,
                                     basis_dim_state=regression_wrapper_state.basis_dim,
                                     hidden_dim=args.hidden_dim,
                                     update_hyperstate=args.update_hyperstate,
                                     policy_use_hyperstate=args.policy_use_hyperstate,
                                     learn_diff=args.learn_diff,
                                     dump_model=args.dump_model)


    #I have to work on the classes before working on the code below.
    flag = False
    from utils import get_data3
    data_buffer = get_data3(env, trials=args.gather_data_epochs, max_steps=max_steps, maxA=maxA)

    init_states = np.stack([env.reset() for _ in range(args.train_policy_batch_size)], axis=0)


    for epoch in range(1000):
        #Train hyperparameters and update systems model.
        states_actions, states, rewards, next_states = unpack(data_buffer)

        next_states_train = (next_states.copy() - states.copy()) if args.learn_diff else next_states.copy()

        if not flag:
            regression_wrapper_state._train_hyperparameters(states_actions, next_states_train)
            regression_wrapper_state._reset_statistics(states_actions, next_states_train)
        else:
            regression_wrapper_state._update(states_actions, next_states_train)

        if len(data_buffer) >= args.max_train_hp_datapoints: flag = True
        if flag: data_buffer = []
        tmp_data_buffer = []

        #Fit policy network.
        #XX, Xy, hyperparameters = zip(*[[rw.XX, rw.Xy, rw.hyperparameters] for rw in regression_wrappers])
        #eval('agent.'+args.fit_function)(args.cma_maxiter, np.copy(init_states), [np.copy(ele) for ele in XX], [np.copy(ele) for ele in Xy], [np.copy(ele) for ele in hyperparameters], sess)
        agent._fit(args.cma_maxiter,
                   init_states.copy(),
                   regression_wrapper_state.XX.copy(),
                   regression_wrapper_state.Xy.copy(),
                   regression_wrapper_state.hyperparameters.copy())

        #Get hyperstate & hyperparameters
        hyperstate_params = [regression_wrapper_state.Llower.copy()[None, ...],
                             regression_wrapper_state.Xy.copy()[None, ...]]
        total_rewards = 0.
        state = env.reset()
        steps = 0
        while True:
            #env.render()
            action = agent._forward(agent.thetas, state[np.newaxis, ...], hyperstate_params)[0]
            next_state, cost, done, _ = env.step(action)
            reward = -cost
            steps += 1

            hyperstate_params = update_hyperstate(agent,
                                                  hyperstate_params,
                                                  regression_wrapper_state.hyperparameters.copy(),
                                                  [state, action, reward, next_state, done],
                                                  args.learn_diff)

            tmp_data_buffer.append([state, action, reward, next_state, done])
            total_rewards += float(reward)
            state = next_state.copy()
            if done or steps >= max_steps:
                print('epoch:', epoch, 'total_rewards:', total_rewards)
                data_buffer.extend(tmp_data_buffer)
                break
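The training loop above imports unpack, update_hyperstate and MultiOutputRegressionWrapper from local modules that are not shown. A minimal sketch of what unpack is assumed to do, based on how the buffer entries are built ([state, action, reward, next_state, done]) and how its return values are consumed:

import numpy as np

def unpack(data_buffer):
    # hypothetical sketch: stack buffer entries into arrays and build the
    # concatenated state-action inputs used to fit the dynamics model
    states = np.stack([entry[0] for entry in data_buffer], axis=0)
    actions = np.stack([entry[1] for entry in data_buffer], axis=0)
    rewards = np.array([entry[2] for entry in data_buffer])[:, None]
    next_states = np.stack([entry[3] for entry in data_buffer], axis=0)
    states_actions = np.concatenate([states, actions], axis=-1)
    return states_actions, states, rewards, next_states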
Example #5
def main():
    parser = argparse.ArgumentParser()

    parser.add_argument("--env", type=str, choices=['cartpole', 'double_cartpole', 'pendulum'], default='cartpole')
    parser.add_argument("--train_hp_iterations", type=int, default=2000)
    parser.add_argument("--basis_dim", type=int, default=256)
    parser.add_argument("--basis_dim_reward", type=int, default=600)
    parser.add_argument("--matern_param", type=float, default=np.inf)
    parser.add_argument("--matern_param_reward", type=float, default=np.inf)
    parser.add_argument("--update_hyperstate", type=int, default=0)

    parser.add_argument("--trials", type=int, default=1)

    args = parser.parse_args()
    print(args)

    if args.env == 'cartpole':
        params = cartpole.default_params()
        cost = partial(cartpole.cartpole_loss, **params['cost'])
        env = cartpole.Cartpole(loss_func=cost, **params['plant'])
        max_steps = 25
        maxA = 10.
    elif args.env == 'double_cartpole':
        params = double_cartpole.default_params()
        cost = partial(double_cartpole.double_cartpole_loss, **params['cost'])
        env = double_cartpole.DoubleCartpole(loss_func=cost, **params['plant'])
        max_steps = 30
        maxA = 20.
    elif args.env == 'pendulum':
        params = pendulum.default_params()
        cost = partial(pendulum.pendulum_loss, **params['cost'])
        env = pendulum.Pendulum(loss_func=cost, **params['plant'])
        max_steps = 40
        maxA = 2.5
    else:
        raise Exception('Unknown environment.')


    states, actions, rewards, next_states = get_data2(env, trials=args.trials, max_steps=max_steps, maxA=maxA)
    states_actions = np.concatenate([states, actions], axis=-1)

    predictor = MultiOutputRegressionWrapper(input_dim=env.observation_space.shape[0]+env.action_space.shape[0],
                                             output_dim=env.observation_space.shape[0],
                                             basis_dim=args.basis_dim,
                                             length_scale=1.,
                                             signal_sd=1.,
                                             noise_sd=5e-4,
                                             prior_sd=1.,
                                             rffm_seed=1,
                                             train_hp_iterations=args.train_hp_iterations,
                                             matern_param=args.matern_param)
    predictor._train_hyperparameters(states_actions, next_states)

    '''
    predictors = []
    for i in range(env.observation_space.shape[0]):
        predictors.append(RegressionWrapper2(input_dim=env.observation_space.shape[0]+env.action_space.shape[0], basis_dim=args.basis_dim, length_scale=1.,
                                          signal_sd=1., noise_sd=5e-4, prior_sd=1., rffm_seed=1, train_hp_iterations=args.train_hp_iterations, matern_param=args.matern_param))

    for i in range(env.observation_space.shape[0]):
        predictors[i]._train_hyperparameters(states_actions, next_states[:, i:i+1])
    '''

    while True:
        '''
        for i in range(env.observation_space.shape[0]):
            predictors[i]._reset_statistics(states_actions, next_states[:, i:i+1], bool(args.update_hyperstate))
        '''
        predictor._reset_statistics(states_actions, next_states)

        states2, actions2, rewards2, next_states2 = get_data2(env, trials=1, max_steps=max_steps, maxA=maxA)
        states_actions2 = np.concatenate([states2, actions2], axis=-1)

        plt.figure()

        predict_mu, predict_sigma = predictor._predict(states_actions2)
        for i in range(env.observation_space.shape[0]):
            plt.subplot(3, env.observation_space.shape[0], i+1)
            plt.plot(np.arange(len(next_states2[:, i:i+1])), next_states2[:, i:i+1])
            plt.errorbar(np.arange(len(predict_mu[:, i:i+1])), predict_mu[:, i:i+1], yerr=np.sqrt(predict_sigma), color='m', ecolor='g')
            plt.grid()

        '''
        for i in range(env.observation_space.shape[0]):
            plt.subplot(3, env.observation_space.shape[0], i+1)

            predict_mu, predict_sigma = predictors[i]._predict(states_actions2, False)

            plt.plot(np.arange(len(next_states2[:, i:i+1])), next_states2[:, i:i+1])
            plt.errorbar(np.arange(len(predict_mu)), predict_mu, yerr=np.sqrt(predict_sigma), color='m', ecolor='g')
            plt.grid()
        '''

        traj_reward = []
        traj = []
        no_lines = 50
        state = np.tile(np.copy(states2[0:1, ...]), [no_lines, 1])
        for a in actions2:
            action = np.tile(a[np.newaxis, ...], [no_lines, 1])
            state_action = np.concatenate([state, action], axis=-1)

            predict_mu, predict_sigma = predictor._predict(state_action)
            state = predict_mu + np.sqrt(predict_sigma) * np.random.normal(size=predict_mu.shape)
            '''
            mu_vec = []
            sigma_vec = []
            for i in range(env.observation_space.shape[0]):
                predict_mu, predict_sigma = predictors[i]._predict(state_action, bool(args.update_hyperstate))
                mu_vec.append(predict_mu)
                sigma_vec.append(predict_sigma)
            mu_vec = np.concatenate(mu_vec, axis=-1)
            sigma_vec = np.concatenate(sigma_vec, axis=-1)
            state = np.stack([np.random.multivariate_normal(mu, np.diag(sigma)) for mu, sigma in zip(mu_vec, sigma_vec)], axis=0)
            '''

            state = np.clip(state, env.observation_space.low, env.observation_space.high)
            traj.append(np.copy(state))

            reward = -env.loss_func(state)
            traj_reward.append(reward)

            '''
            for i in range(env.observation_space.shape[0]):
                predictors[i]._update_hyperstate(state_action, state[:, i:i+1], bool(args.update_hyperstate))
            '''

        traj_reward = np.stack(traj_reward, axis=-1)
        traj = np.stack(traj, axis=-1)
        
        plt.subplot(3, 1, 3)
        for j in range(no_lines):
            y = traj_reward[j, :]
            plt.plot(np.arange(len(y)), y, color='r')
        plt.plot(np.arange(len(rewards2)), rewards2)
        plt.grid()

        for i in range(env.observation_space.shape[0]):
            plt.subplot(3, env.observation_space.shape[0], env.observation_space.shape[0]+i+1)
            for j in range(no_lines):
                y = traj[j, i, :]
                plt.plot(np.arange(len(y)), y, color='r')

            plt.plot(np.arange(len(next_states2[..., i])), next_states2[..., i])
            plt.grid()

        plt.show(block=True)
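get_data2 comes from a local utilities module that is not shown. A hypothetical sketch consistent with how it is called here: it rolls out a few episodes with uniform random actions in [-maxA, maxA] and negates the per-step cost into a reward, mirroring the interaction loop in Example #4.

import numpy as np

def get_data2(env, trials=1, max_steps=25, maxA=10.):
    # hypothetical sketch: gather random-policy rollouts and return stacked arrays
    states, actions, rewards, next_states = [], [], [], []
    for _ in range(trials):
        state = env.reset()
        for _ in range(max_steps):
            action = np.random.uniform(-maxA, maxA, size=env.action_space.shape)
            next_state, cost, done, _ = env.step(action)
            states.append(state)
            actions.append(action)
            rewards.append(-cost)
            next_states.append(next_state)
            state = next_state
            if done:
                break
    return (np.stack(states, axis=0), np.stack(actions, axis=0),
            np.array(rewards), np.stack(next_states, axis=0))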
Example #6
    source_dir = os.path.join(base_dir,
                              '/home/juancamilog/.kusanagi/output/cartpole/')
    #source_dir = os.path.join(base_dir,'examples/learned_policies/cartpole_serial')
    target_dir = os.path.join(base_dir,
                              'examples/learned_policies/target_sim2robot')
    # SOURCE DOMAIN
    utils.set_output_dir(source_dir)
    # load source experience
    source_experience = ExperienceDataset(
        filename='PILCO_SSGP_UI_6_4_Cartpole_RBFPolicy_sat_dataset')
    #source_experience = ExperienceDataset(filename='PILCO_SSGP_UI_SerialPlant_RBFPolicy_sat_dataset')
    #load source policy
    source_policy = RBFPolicy(filename='RBFPolicy_sat_5_1_cpu_float64')

    # TARGET DOMAIN
    utils.set_output_dir(target_dir)
    target_params = default_params()
    target_params['params']['H'] = 5.0  # control horizon
    target_params['params']['max_evals'] = 125
    # policy
    target_params['dynmodel_class'] = kreg.BNN  #SSGP_UI
    target_params['invdynmodel_class'] = kreg.BNN  #GP_UI
    target_params['params']['invdynmodel'] = {}
    target_params['params']['invdynmodel']['max_evals'] = 1000
    target_params['policy_class'] = AdjustedPolicy
    target_params['params']['policy']['adjustment_model_class'] = kreg.BNN
    #target_params['params']['policy']['adjustment_model_class'] = control.RBFPolicy
    #target_params['params']['policy']['n_inducing'] = 20
    # disable the saturation function, since bigger controls are probably needed for heavier pendulums
    target_params['params']['policy']['sat_func'] = None
    target_params['params']['policy']['max_evals'] = 5000
    target_params['params']['policy']['m0'] = np.zeros(source_policy.D +