Example #1
    def _learn(self, training_inputs, training_targets):
        assert self.ninducing_points

        # Subsampling the data if required.
        if self.nsampled_training_points:
            training_inputs, training_targets = self._subsample_training_set(
                training_inputs, training_targets)

        # Sparse GP (PILCO would fall back to a full GP if no inducing points were given).
        self.pilco_ = PILCO(training_inputs, training_targets,
                            self.ninducing_points)
        self.pilco_.optimize_models(disp=True)
Example #2
def load_pilco(path, controller=None, reward=None, sparse=False):
    X = np.loadtxt(path + 'X.csv', delimiter=',')
    Y = np.loadtxt(path + 'Y.csv', delimiter=',')
    if not sparse:
        pilco = PILCO((X, Y), controller=controller, reward=reward)
    else:
        with open(path + 'n_ind.txt', 'r') as f:
            n_ind = int(f.readline())
        pilco = PILCO((X, Y),
                      num_induced_points=n_ind,
                      controller=controller,
                      reward=reward)

    params = np.load(path + "pilco_values.npy", allow_pickle=True).item()
    # pilco.assign(params)
    for i, m in enumerate(pilco.mgpr.models):
        values = np.load(path + "model_" + str(i) + ".npy",
                         allow_pickle=True).item()
        # m.assign(values)
    return pilco
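For reference, a minimal counterpart sketch that writes the files load_pilco expects. The helper name save_pilco, the attribute pilco.mgpr.num_induced_points, and the read_values() calls (GPflow 1.x style, seen commented out in later examples) are assumptions rather than part of the original snippet.

def save_pilco(path, X, Y, pilco, sparse=False):
    # Training data in the same CSV layout that load_pilco reads back.
    np.savetxt(path + 'X.csv', X, delimiter=',')
    np.savetxt(path + 'Y.csv', Y, delimiter=',')
    if sparse:
        # Number of inducing points for the sparse case (assumed attribute).
        with open(path + 'n_ind.txt', 'w') as f:
            f.write('%d' % pilco.mgpr.num_induced_points)
    # One value-dict for the top-level PILCO object and one per GP model.
    np.save(path + 'pilco_values.npy', pilco.read_values())
    for i, m in enumerate(pilco.mgpr.models):
        np.save(path + 'model_' + str(i) + '.npy', m.read_values())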
Example #3
        X_, Y_, rewards = rollout(env=env,
                                  pilco=None,
                                  random=True,
                                  timesteps=40)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))
        random_rewards.append(sum(rewards))

    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim
    controller = RbfController(state_dim=state_dim,
                               control_dim=control_dim,
                               num_basis_functions=5)
    # controller = LinearController(state_dim=state_dim, control_dim=control_dim)

    pilco = PILCO(X, Y, controller=controller, horizon=40)
    # Example of user provided reward function, setting a custom target state
    # R = ExponentialReward(state_dim=state_dim, t=np.array([0.1,0,0,0]))
    # pilco = PILCO(X, Y, controller=controller, horizon=40, reward=R)

    # Example of fixing a parameter, optional, for a linear controller only
    # pilco.controller.b = np.array([[0.0]])
    # pilco.controller.b.trainable = False

    for rollouts in range(3):
        pilco.optimize_models()
        pilco.optimize_policy()
        X_new, Y_new, _ = rollout(env=env, pilco=pilco, timesteps=100)

        # Update dataset
        X = np.vstack((X, X_new))
Example #4
    # Initial random rollouts to generate a dataset
    X,Y = rollout(env, None, timesteps=T, random=True, SUBS=SUBS)
    for i in range(1,J):
        X_, Y_ = rollout(env, None, timesteps=T, random=True, SUBS=SUBS, verbose=True)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))

    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim

    controller = RbfController(state_dim=state_dim, control_dim=control_dim, num_basis_functions=bf, max_action=max_action)

    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)

    pilco = PILCO(X, Y, controller=controller, horizon=T, reward=R, m_init=m_init, S_init=S_init)

    # for numerical stability
    for model in pilco.mgpr.models:
        # model.kern.lengthscales.prior = gpflow.priors.Gamma(1, 10)  # priors have to be included
        # model.kern.variance.prior = gpflow.priors.Gamma(1.5, 2)     # before the model gets compiled
        model.likelihood.variance = 0.001
        model.likelihood.variance.trainable = False

    for rollouts in range(N):
        print("**** ITERATION no", rollouts, " ****")
        pilco.optimize_models(maxiter=maxiter, restarts=2)
        pilco.optimize_policy(maxiter=maxiter, restarts=2)

        X_new, Y_new = rollout(env, pilco, timesteps=T_sim, verbose=True, SUBS=SUBS)
Example #5
def safe_swimmer_run(seed=0, logging=False):
    env = SwimmerWrapper()
    state_dim = 9
    control_dim = 2
    SUBS = 2
    maxiter = 60
    max_action = 1.0
    m_init = np.reshape(np.zeros(state_dim),
                        (1, state_dim))  # initial state mean
    S_init = 0.05 * np.eye(state_dim)
    J = 1
    N = 12
    T = 25
    bf = 30
    T_sim = 100

    # Reward function that discourages the joints from hitting their max angles
    weights_l = np.zeros(state_dim)
    weights_l[0] = 0.5
    max_ang = (100 / 180 * np.pi) * 0.95
    R1 = LinearReward(state_dim, weights_l)

    C1 = SingleConstraint(1, low=-max_ang, high=max_ang, inside=False)
    C2 = SingleConstraint(2, low=-max_ang, high=max_ang, inside=False)
    C3 = SingleConstraint(3, low=-max_ang, high=max_ang, inside=False)
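    # Combine the linear reward with the three constraint terms; the negative
    # coefficients below turn predicted violations into penalties.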
    R = CombinedRewards(state_dim, [R1, C1, C2, C3],
                        coefs=[1.0, -10.0, -10.0, -10.0])

    th = 0.2
    # Initial random rollouts to generate a dataset
    X, Y, _, _ = rollout(env,
                         None,
                         timesteps=T,
                         random=True,
                         SUBS=SUBS,
                         verbose=True)
    for i in range(1, J):
        X_, Y_, _, _ = rollout(env,
                               None,
                               timesteps=T,
                               random=True,
                               SUBS=SUBS,
                               verbose=True)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))

    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim
    controller = RbfController(state_dim=state_dim,
                               control_dim=control_dim,
                               num_basis_functions=bf,
                               max_action=max_action)

    pilco = PILCO((X, Y),
                  controller=controller,
                  horizon=T,
                  reward=R,
                  m_init=m_init,
                  S_init=S_init)
    for model in pilco.mgpr.models:
        model.likelihood.variance.assign(0.001)
        set_trainable(model.likelihood.variance, False)

    new_data = True
    eval_runs = T_sim
    evaluation_returns_full = np.zeros((N, eval_runs))
    evaluation_returns_sampled = np.zeros((N, eval_runs))
    X_eval = []
    for rollouts in range(N):
        print("**** ITERATION no", rollouts, " ****")
        if new_data:
            pilco.optimize_models(maxiter=100)
            new_data = False
        pilco.optimize_policy(maxiter=1, restarts=2)

        m_p = np.zeros((T, state_dim))
        S_p = np.zeros((T, state_dim, state_dim))
        predicted_risk1 = np.zeros(T)
        predicted_risk2 = np.zeros(T)
        predicted_risk3 = np.zeros(T)
        for h in range(T):
            m_h, S_h, _ = pilco.predict(m_init, S_init, h)
            m_p[h, :], S_p[h, :, :] = m_h[:], S_h[:, :]
            predicted_risk1[h], _ = C1.compute_reward(m_h, S_h)
            predicted_risk2[h], _ = C2.compute_reward(m_h, S_h)
            predicted_risk3[h], _ = C3.compute_reward(m_h, S_h)
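        # Aggregate the per-step violation probabilities into an episode-level
        # risk via 1 - prod(1 - p), i.e. treating the timesteps as independent.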
        estimate_risk1 = 1 - np.prod(1.0 - predicted_risk1)
        estimate_risk2 = 1 - np.prod(1.0 - predicted_risk2)
        estimate_risk3 = 1 - np.prod(1.0 - predicted_risk3)
        overall_risk = 1 - (1 - estimate_risk1) * (1 - estimate_risk2) * (
            1 - estimate_risk3)
        if overall_risk < th:
            X_new, Y_new, _, _ = rollout(env,
                                         pilco,
                                         timesteps=T_sim,
                                         verbose=True,
                                         SUBS=SUBS)
            new_data = True
            # Update dataset
            X = np.vstack((X, X_new[:T, :]))
            Y = np.vstack((Y, Y_new[:T, :]))
            pilco.mgpr.set_data((X, Y))
            if estimate_risk1 < th / 10:
                R.coefs.assign(R.coefs.value() * [1.0, 0.75, 1.0, 1.0])
            if estimate_risk2 < th / 10:
                R.coefs.assign(R.coefs.value() * [1.0, 1.0, 0.75, 1.0])
            if estimate_risk3 < th / 10:
                R.coefs.assign(R.coefs.value() * [1.0, 1.0, 1.0, 0.75])
        else:
            print("*********CHANGING***********")
            if estimate_risk1 > th / 3:
                R.coefs.assign(R.coefs.value() * [1.0, 1.5, 1.0, 1.0])
            if estimate_risk2 > th / 3:
                R.coefs.assign(R.coefs.value() * [1.0, 1.0, 1.5, 1.0])
            if estimate_risk3 > th / 3:
                R.coefs.assign(R.coefs.value() * [1.0, 1.0, 1.0, 1.5])
            _, _, r = pilco.predict(m_init, S_init, T)
Example #6
        m_init = np.transpose(X[0, :-1, None])
        S_init = 0.1 * np.eye(state_dim)
        max_yerror = 0.3
        np.save(main_path + "/SAVED/data_mean_kn", data_mean)
        np.save(main_path + "/SAVED/data_std_kn", data_std)

        W = np.diag([1e-2, 1e-2, 1e-3, 1e-3])  # Weight of states
        T_sim = 600
        target = np.divide([0.0, 0.0, 0.0, 0.0] - data_mean, data_std)

        R, controller = pilcotrac.get_RewardFun_and_Controller(
            state_dim, control_dim, data_mean, data_std, max_yerror, W)

        pilco = PILCO(X,
                      Y,
                      num_induced_points=100,
                      controller=controller,
                      horizon=T,
                      reward=R)

        for model in pilco.mgpr.models:

            # model.clear()
            # model.kern.lengthscales.prior = gpflow.priors.Gamma(3, 20)  # priors have to be included
            # model.kern.variance.prior = gpflow.priors.Gamma(1.5, 4)     # before the model gets compiled
            model.likelihood.variance = 0.1
            model.likelihood.variance.trainable = False
            # model.compile()
            # vals = model.read_values()
            # print(vals)
Example #7
                               render=True)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))

    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim

    controller = RbfController(state_dim=state_dim,
                               control_dim=control_dim,
                               num_basis_functions=bf,
                               max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)

    pilco = PILCO((X, Y),
                  controller=controller,
                  horizon=T,
                  reward=R,
                  m_init=m_init,
                  S_init=S_init)

    # for numerical stability, we can set the likelihood variance parameters of the GP models
    for model in pilco.mgpr.models:
        model.likelihood.variance.assign(0.001)
        set_trainable(model.likelihood.variance, False)

    r_new = np.zeros((T, 1))
    for rollouts in range(N):
        print("**** ITERATION no", rollouts, " ****")
        pilco.optimize_models(maxiter=maxiter, restarts=2)
        pilco.optimize_policy(maxiter=maxiter, restarts=2)

        X_new, Y_new, _, _ = rollout(env,
Example #8
Y = np.divide(Y1, np.std(X1[:, :2], 0))

state_dim = Y.shape[1]
control_dim = X.shape[1] - state_dim
m_init = np.transpose(X[0, :-1, None])
S_init = 0.5 * np.eye(state_dim)
controller = RbfController(state_dim=state_dim,
                           control_dim=control_dim,
                           num_basis_functions=25)

R = ExponentialReward(state_dim=state_dim,
                      t=np.divide([0.5, 0.0] - env.m, env.std),
                      W=np.diag([0.5, 0.1]))
pilco = PILCO((X, Y),
              controller=controller,
              horizon=T,
              reward=R,
              m_init=m_init,
              S_init=S_init)

best_r = 0
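# Score every recorded state under R with a small fixed covariance; episode
# returns are summed from these per-state rewards below.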
all_Rs = np.zeros((X.shape[0], 1))
for i in range(len(all_Rs)):
    all_Rs[i, 0] = R.compute_reward(X[i, None, :-1],
                                    0.001 * np.eye(state_dim))[0]

ep_rewards = np.zeros((len(X) // T, 1))

for i in range(len(ep_rewards)):
    ep_rewards[i] = sum(all_Rs[i * T:i * T + T])

for model in pilco.mgpr.models:
Example #9
# Initial random rollouts to generate a dataset
X, Y = rollout(policy=random_policy, timesteps=40)
for i in range(1, 3):
    X_, Y_ = rollout(policy=random_policy, timesteps=40)
    X = np.vstack((X, X_))
    Y = np.vstack((Y, Y_))

state_dim = Y.shape[1]
control_dim = X.shape[1] - state_dim
controller = RbfController(state_dim=state_dim,
                           control_dim=control_dim,
                           num_basis_functions=5)
#controller = LinearController(state_dim=state_dim, control_dim=control_dim)

pilco = PILCO(X, Y, controller=controller, horizon=20)

# Example of fixing a parameter, optional, for a linear controller only
#pilco.controller.b = np.array([[0.0]])
#pilco.controller.b.trainable = False

for rollouts in range(3):
    pilco.optimize()
    import pdb
    pdb.set_trace()
    X_new, Y_new = rollout(policy=pilco_policy, timesteps=100)
    # Update dataset
    X = np.vstack((X, X_new))
    Y = np.vstack((Y, Y_new))
    pilco.mgpr.set_XY(X, Y)
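This fragment relies on random_policy and pilco_policy helpers defined in its (truncated) source; a sketch matching the definitions that appear in Example #15, with env and pilco taken from the surrounding snippet:

# Assumed policy wrappers (cf. Example #15).
def random_policy(x):
    return env.action_space.sample() / 5

def pilco_policy(x):
    return pilco.compute_action(x[None, :])[0, :]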
Example #10
    for i in range(1,4): #No action sampling; u := 0
        X_, Y_ = rollout(env=env, pilco=None, random=None,  timesteps=80, SUBS=subs, render=False)
        X = np.vstack((X, X_)).astype(np.float64)
        Y = np.vstack((Y, Y_)).astype(np.float64)        


    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim
    controller = RbfController(state_dim=state_dim, control_dim=control_dim, num_basis_functions=18, max_action=env.action_space.high)
    #controller = LinearController(state_dim=state_dim, control_dim=control_dim)
    #print(X)
    #pilco = PILCO(X, Y, controller=controller, horizon=40, num_induced_points=72)
    
    # Example of user provided reward function, setting a custom target state
    R = ExponentialReward(state_dim=state_dim, t=target)
    pilco = PILCO(X, Y, controller=controller, horizon=20, reward=R, 
                        num_induced_points=72, m_init=m_init)

    # Example of fixing a parameter, optional, for a linear controller only
    #pilco.controller.b = np.array([[0.0]])
    #pilco.controller.b.trainable = False
    plot_model = False # Plot model set to false until iteration 5 is reached
    for rollouts in range(20):
        print("Optimizing models") 
        pilco.optimize_models()
        print("Optimizing controller")
        pilco.optimize_policy(maxiter=20)
        #import pdb; pdb.set_trace()
        X_new, Y_new = rollout(env=env, pilco=pilco, timesteps=120, SUBS=subs, render=False)
        print("No of ops:", len(tf.get_default_graph().get_operations()))
        # Update dataset
        X = np.vstack((X, X_new)).astype(np.float64)
Example #11
def pilco_run(env,
              N,
              J,
              safe=False,
              name='',
              seed=0,
              cont=None,
              rew=None,
              SUBS=1,
              sim_timesteps=50,
              plan_timesteps=30,
              restarts=1,
              maxiter=100,
              m_init=None,
              S_init=None,
              fixed_noise=None,
              logging=False,
              eval_runs=5,
              eval_max_timesteps=None,
              variable_episode_length=False):
    np.random.seed(seed)
    X, Y, _, _ = rollout(env=env,
                         pilco=None,
                         timesteps=sim_timesteps,
                         random=True,
                         SUBS=SUBS)
    for i in range(1, J):
        X_, Y_, _, _ = rollout(env=env,
                               pilco=None,
                               timesteps=sim_timesteps,
                               random=True,
                               SUBS=SUBS)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))
    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim

    if cont is None:
        controller = RbfController(state_dim=state_dim,
                                   control_dim=control_dim,
                                   num_basis_functions=5)
    elif cont['type'] == 'rbf':
        controller = RbfController(state_dim=state_dim,
                                   control_dim=control_dim,
                                   num_basis_functions=cont['basis_functions'],
                                   max_action=cont.get('max_action', 1.0))
    elif cont['type'] == 'linear':
        controller = LinearController(state_dim=state_dim,
                                      control_dim=control_dim,
                                      max_action=cont.get('max_action', 1.0))
    else:
        raise ValueError('Invalid Controller')

    if rew is None:
        reward = None
    elif rew['type'] == 'exp':
        reward = ExponentialReward(state_dim=state_dim, t=rew['t'], W=rew['W'])
    else:
        raise ValueError('This function only handles Exponential rewards for now')

    pilco = PILCO((X, Y),
                  controller=controller,
                  reward=reward,
                  horizon=plan_timesteps,
                  m_init=m_init,
                  S_init=S_init)

    if fixed_noise is not None:
        for model in pilco.mgpr.models:
            model.likelihood.variance.assign(fixed_noise)
            set_trainable(model.likelihood.variance, False)

    evaluation_returns_full = np.zeros((N, eval_runs))
    evaluation_returns_sampled = np.zeros((N, eval_runs))
    if name == '':
        from datetime import datetime
        current_time = datetime.now()
        name = current_time.strftime("%d_%m_%Y_%H_%M_%S")
    for rollouts in range(N):
        print("**** ITERATION no", rollouts, " ****")
        pilco.optimize_models()
        pilco.optimize_policy(maxiter=maxiter, restarts=restarts)

        X_new, Y_new, _, _ = rollout(env,
                                     pilco,
                                     timesteps=sim_timesteps,
                                     SUBS=SUBS,
                                     verbose=True)

        cur_rew = 0
        X = np.vstack((X, X_new[:plan_timesteps, :]))
        Y = np.vstack((Y, Y_new[:plan_timesteps, :]))
        pilco.mgpr.set_data((X, Y))
        if logging:
            if eval_max_timesteps is None:
                eval_max_timesteps = sim_timesteps
            for k in range(0, eval_runs):
                [
                    X_eval_, _, evaluation_returns_sampled[rollouts, k],
                    evaluation_returns_full[rollouts, k]
                ] = rollout(env,
                            pilco,
                            timesteps=eval_max_timesteps,
                            verbose=False,
                            SUBS=SUBS,
                            render=False)
                if rollouts == 0 and k == 0:
                    X_eval = X_eval_.copy()
                else:
                    X_eval = np.vstack((X_eval, X_eval_))
            if not os.path.exists("results/" + name):
                os.makedirs("results/" + name)
            np.savetxt("results/" + name + "X_" + str(seed) + ".csv",
                       X,
                       delimiter=',')
            np.savetxt("results/" + name + "X_eval_" + str(seed) + ".csv",
                       X_eval,
                       delimiter=',')
            np.savetxt("results/" + name + "evaluation_returns_sampled_" +
                       str(seed) + ".csv",
                       evaluation_returns_sampled,
                       delimiter=',')
            np.savetxt("results/" + name + "evaluation_returns_full_" +
                       str(seed) + ".csv",
                       evaluation_returns_full,
                       delimiter=',')
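A hedged usage sketch for pilco_run: the cont and rew dictionary formats are inferred from the branches above, while the environment, dimensions, and numeric values are placeholders rather than part of the original code.

# Hypothetical invocation of pilco_run (all concrete values are placeholders).
state_dim = 4  # known dimensionality of the chosen environment
cont = {'type': 'rbf', 'basis_functions': 30, 'max_action': 1.0}
rew = {'type': 'exp', 't': np.zeros(state_dim), 'W': np.eye(state_dim)}
pilco_run(env, N=10, J=3, cont=cont, rew=rew, SUBS=2,
          sim_timesteps=50, plan_timesteps=30, maxiter=100,
          fixed_noise=0.001, logging=True)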
Example #12
    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim
    m_init = np.transpose(X[0, :-1, None])
    S_init = 0.05 * np.eye(state_dim)
    np.save(main_path+"/SAVED/v4_data_mean_5", data_mean)
    np.save(main_path+"/SAVED/v4_data_std_5", data_std)

    W = np.diag(np.array([1.0, 0.5, 0.1, 0.1]))  # Weight of states
    T_sim = 600
    target = np.divide([0.0, 0.0, 0.0, 0.0] - data_mean, data_std)

    R, controller = pilcotrac.get_RewardFun_and_Controller(state_dim,control_dim,data_mean,data_std,max_yerror,W)

    pilco = PILCO((X, Y), num_induced_points=100,controller=controller, horizon=T,reward=R,m_init=m_init,S_init=S_init) # num_induced_points: Number of sparse GP-pseudo-points
    
    for model in pilco.mgpr.models:


        # model.clear()
        # model.kern.lengthscales.prior = gpflow.priors.Gamma(3,20) # priors have to be included before  the model gets compiled

        # model.kern.variance.prior = gpflow.priors.Gamma(1.5,4)    
        model.likelihood.variance.assign(0.05)  # take a large noise to improve numerical stability (Cholesky decomposition failures)
        set_trainable(model.likelihood.variance, False)
        # model.compile()
        # vals = model.read_values()
        # print(vals)
    
Example #13
File: pilco_test.py  Project: emsal0/gp-ros
                     render=False)
for i in range(1, J):
    X_, Y_, _, _ = rollout(env,
                           None,
                           timesteps=T,
                           verbose=False,
                           random=True,
                           SUBS=SUBS,
                           render=False)
    X = np.vstack((X, X_))
    Y = np.vstack((Y, Y_))

state_dim = Y.shape[1]
control_dim = X.shape[1] - state_dim
controller = RbfController(state_dim=state_dim,
                           control_dim=control_dim,
                           num_basis_functions=10,
                           max_action=max_action)
R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
pilco = PILCO((X, Y),
              controller=controller,
              horizon=T,
              reward=R,
              m_init=m_init,
              S_init=S_init)

pilco.optimize_models(maxiter=100)
pilco.optimize_policy(maxiter=20)

X_new, Y_new, _, _ = rollout(env, pilco, timesteps=T, SUBS=SUBS, render=False)
Example #14
class PilcoDynamicsLearner(DynamicsLearnerInterface):
    def __init__(self,
                 history_length,
                 prediction_horizon,
                 ninducing_points,
                 nsampled_training_points=None,
                 averaging=True):
        super().__init__(history_length,
                         prediction_horizon,
                         averaging=averaging)
        self.ninducing_points = ninducing_points
        self.nsampled_training_points = nsampled_training_points

    def _learn(self, training_inputs, training_targets):
        assert self.ninducing_points

        # Subsampling the data if required.
        if self.nsampled_training_points:
            training_inputs, training_targets = self._subsample_training_set(
                training_inputs, training_targets)

        # Sparse GP (PILCO would fall back to a full GP if no inducing points were given).
        self.pilco_ = PILCO(training_inputs, training_targets,
                            self.ninducing_points)
        self.pilco_.optimize_models(disp=True)

    def _predict(self, inputs):
        assert self.pilco_, "a trained model must be available"
        prediction = []
        for model in self.pilco_.mgpr.models:
            means, _ = model.predict_f(inputs)
            prediction.append(means)
        return np.hstack(prediction)

    def _subsample_training_set(self, training_inputs, training_targets):
        assert self.nsampled_training_points
        total_size = training_inputs.shape[0]
        permutation = np.random.permutation(
            total_size)[:self.nsampled_training_points]
        return training_inputs[permutation], training_targets[permutation]

    def name(self):
        return "pilco"

    def load(self, filename):
        params_dict = np.load(filename)
        for k in params_dict.keys():
            print(k, params_dict[k].shape)
        raise NotImplementedError  # TODO: parse the hyperparameters.

    def save(self, filename):
        """
        Stores the hyperparameters of the GPs which includes inducing points
        in the case of sparse approximations.
        """
        params_dict = defaultdict(list)
        for model in self.pilco_.mgpr.models:
            params = model.read_trainables()
            for key in params.keys():
                params_dict[key].append(params[key])
        np.savez(filename, **params_dict)
Example #15
        X.append(np.hstack((x, u)))
        Y.append(x_new - x)
        x = x_new
    return np.stack(X), np.stack(Y)
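Only the tail of the rollout helper survives above. A hedged reconstruction of the full function, following the inline loop of Example #18; the null initial step and the four-value Gym step signature are assumptions:

def rollout(policy, timesteps):
    X, Y = [], []
    env.reset()
    x, _, _, _ = env.step(0)  # take a null step to obtain an initial observation
    for _ in range(timesteps):
        u = policy(x)
        x_new, _, _, _ = env.step(u)
        X.append(np.hstack((x, u)))  # state-action pair as the GP input
        Y.append(x_new - x)          # state difference as the GP target
        x = x_new
    return np.stack(X), np.stack(Y)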


# Initial random rollouts to generate a dataset
def random_policy(x):
    return env.action_space.sample()/5
X1, Y1 = rollout(policy=random_policy, timesteps=40)
X2, Y2 = rollout(policy=random_policy, timesteps=40)
X = np.vstack((X1, X2))
Y = np.vstack((Y1, Y2))

# Define PILCO on the two random rollouts
pilco = PILCO(X, Y, horizon=50)
# Example of fixing a parameter
pilco.controller.b = np.array([[0.0]])
pilco.controller.b.trainable = False

def pilco_policy(x):
    return pilco.compute_action(x[None, :])[0, :]

for rollouts in range(2):
    pilco.optimize()
    import pdb; pdb.set_trace()
    X_new, Y_new = rollout(policy=pilco_policy, timesteps=150)
    # Update dataset
    X = np.vstack((X, X_new)); Y = np.vstack((Y, Y_new))
    pilco.mgpr.set_XY(X, Y)
Example #16
def swimmer_run(name, seed):
    env = gym.make('Swimmer-v2').env
    #env = SwimmerWrapper()
    state_dim = 8
    control_dim = 2
    SUBS = 5
    maxiter = 120
    max_action = 1.0
    m_init = np.reshape(np.zeros(state_dim),
                        (1, state_dim))  # initial state mean
    S_init = 0.005 * np.eye(state_dim)
    J = 10
    N = 15
    T = 15
    bf = 40
    T_sim = 50

    # Reward function that discourages the joints from hitting their max angles
    weights_l = np.zeros(state_dim)
    weights_l[3] = 1.0
    max_ang = 95 / 180 * np.pi
    t1 = np.zeros(state_dim)
    t1[2] = max_ang
    w1 = 1e-6 * np.eye(state_dim)
    w1[2, 2] = 10
    t2 = np.zeros(state_dim)
    t2[1] = max_ang
    w2 = 1e-6 * np.eye(state_dim)
    w2[1, 1] = 10
    t5 = np.zeros(state_dim)
    #t5[0] = max_ang
    #w3 = 1e-6 * np.eye(state_dim)
    #w3[0,0] = 5
    t3 = np.zeros(state_dim)
    t3[2] = -max_ang
    t4 = np.zeros(state_dim)
    t4[1] = -max_ang
    #t6 = np.zeros(state_dim); t6[0] = -max_ang
    R2 = LinearReward(state_dim, weights_l)
    R3 = ExponentialReward(state_dim, W=w1, t=t1)
    R4 = ExponentialReward(state_dim, W=w2, t=t2)
    R5 = ExponentialReward(state_dim, W=w1, t=t3)
    R6 = ExponentialReward(state_dim, W=w2, t=t4)
    #R7 = ExponentialReward(state_dim, W=w3, t=t5)
    #R8 = ExponentialReward(state_dim, W=w3, t=t6)
    Rew = CombinedRewards(state_dim, [R2, R3, R4, R5, R6],
                          coefs=[1.0, -1.0, -1.0, -1.0, -1.0])
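    # The linear term rewards state dimension 3; the exponential terms, weighted
    # -1, penalise state dimensions 1 and 2 (the joint angles) near +/-max_ang.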
    # Rew = R2
    # Initial random rollouts to generate a dataset
    X, Y, _, _ = rollout(env,
                         None,
                         timesteps=T,
                         random=True,
                         SUBS=SUBS,
                         verbose=True,
                         render=False)
    for i in range(1, J):
        X_, Y_, _, _ = rollout(env,
                               None,
                               timesteps=T,
                               random=True,
                               SUBS=SUBS,
                               verbose=True,
                               render=False)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))

    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim
    controller = RbfController(state_dim=state_dim,
                               control_dim=control_dim,
                               num_basis_functions=bf,
                               max_action=max_action)
    # controller = LinearController(state_dim=state_dim, control_dim=control_dim, max_action=max_action)

    pilco = PILCO((X, Y),
                  controller=controller,
                  horizon=T,
                  reward=Rew,
                  m_init=m_init,
                  S_init=S_init)
    #for model in pilco.mgpr.models:
    #    model.likelihood.variance = 0.0001
    #    model.likelihood.variance.trainable = False

    logging = True  # To save results in .csv files turn this flag to True
    eval_runs = 10
    evaluation_returns_full = np.zeros((N, eval_runs))
    evaluation_returns_sampled = np.zeros((N, eval_runs))
    eval_max_timesteps = 1000 // SUBS
    X_eval = False
    for rollouts in range(N):
        print("**** ITERATION no", rollouts, " ****")
        pilco.optimize_models(restarts=2)
        pilco.optimize_policy(maxiter=maxiter, restarts=2)

        X_new, Y_new, _, _ = rollout(env,
                                     pilco,
                                     timesteps=T_sim,
                                     verbose=True,
                                     SUBS=SUBS,
                                     render=False)

        cur_rew = 0
        for t in range(0, len(X_new)):
            cur_rew += Rew.compute_reward(
                X_new[t, 0:state_dim, None].transpose(),
                0.0001 * np.eye(state_dim))[0]
            if t == T:
                print(
                    'On this episode, on the planning horizon, PILCO reward was: ',
                    cur_rew)
        print('On this episode PILCO reward was ', cur_rew)

        gym_steps = 1000
        T_eval = gym_steps // SUBS
        # Update dataset
        X = np.vstack((X, X_new[:T, :]))
        Y = np.vstack((Y, Y_new[:T, :]))
        pilco.mgpr.set_data((X, Y))
        if logging:
            if eval_max_timesteps is None:
                eval_max_timesteps = T_sim
            for k in range(0, eval_runs):
                [
                    X_eval_, _, evaluation_returns_sampled[rollouts, k],
                    evaluation_returns_full[rollouts, k]
                ] = rollout(env,
                            pilco,
                            timesteps=eval_max_timesteps,
                            verbose=False,
                            SUBS=SUBS,
                            render=False)
                if rollouts == 0 and k == 0:
                    X_eval = X_eval_.copy()
                else:
                    X_eval = np.vstack((X_eval, X_eval_))
            if not os.path.exists(name):
                os.makedirs(name)
            np.savetxt(name + "X_" + str(seed) + ".csv", X, delimiter=',')
            np.savetxt(name + "X_eval_" + str(seed) + ".csv",
                       X_eval,
                       delimiter=',')
            np.savetxt(name + "evaluation_returns_sampled_" + str(seed) + ".csv",
                       evaluation_returns_sampled,
                       delimiter=',')
            np.savetxt(name + "evaluation_returns_full_" + str(seed) + ".csv",
                       evaluation_returns_full,
                       delimiter=',')
Example #17
                                  random=True,
                                  timesteps=T_sim,
                                  render=False,
                                  verbose=verbose)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))

    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim
    m_init = np.reshape(np.zeros(state_dim),
                        (1, state_dim))  # initial state mean

    # controller = RbfController(state_dim=state_dim, control_dim=control_dim, num_basis_functions=40)
    controller = LinearController(state_dim=state_dim, control_dim=control_dim)

    pilco = PILCO((X, Y), controller=controller, horizon=T, m_init=m_init)
    pilco.controller.max_action = e

    # for numerical stability
    for model in pilco.mgpr.models:
        model.likelihood.variance.assign(0.001)
        set_trainable(model.likelihood.variance, False)
        # model.likelihood.fixed=True

    return_lst = []
    task_length = []
    for rollouts in range(100):
        print("**** ITERATION no.", rollouts, " ****")
        try:
            pilco.optimize_models(maxiter=maxiter)
            pilco.optimize_policy(maxiter=maxiter)
Example #18
X = []
Y = []
for rollouts in range(5):
    env.reset()
    x, _, _, _ = env.step(0)
    for i in range(50):
        env.render()
        u = env.action_space.sample() / 5
        x_, _, _, _ = env.step(u)

        X.append(np.hstack((x, u)))
        Y.append(x_ - x)
        x = x_

pilco = PILCO(np.stack(X), np.stack(Y))
pilco.controller.t.trainable = False
pilco.controller.b.trainable = False
pilco.optimize()

import pdb
pdb.set_trace()
env.reset()
x, _, _, _ = env.step(0)
for i in range(50):
    env.render()
    u = pilco.compute_action(x[None, :])
    x_, _, _, _ = env.step(u)

    X.append(np.hstack((x, u[0, :])))
    Y.append(x_ - x)