Example #1
def test_rbf():
    np.random.seed(0)
    d = 3  # Input dimension
    k = 2  # Number of outputs
    b = 100  # basis functions

    # Training Dataset
    X0 = np.random.rand(100, d)
    A = np.random.rand(d, k)
    Y0 = np.sin(X0).dot(A) + 1e-3 * (np.random.rand(100, k) - 0.5)  # Just something smooth
    rbf = RbfController(d, k, b)
    rbf.set_XY(X0, Y0)

    # Generate input
    m = np.random.rand(1, d)  # But MATLAB defines it as m'
    s = np.random.rand(d, d)
    s = s.dot(s.T)  # Make s positive semidefinite

    M, S, V = rbf.compute_action(m, s, squash=False)

    print("M\n", M)
    print("S\n", S)
    print("V\n", V)
    # convert data to the struct expected by the MATLAB implementation
    lengthscales = rbf.model.covar_module.lengthscale.cpu().detach().numpy().squeeze()
    variance = 1 * np.ones(k)
    noise = rbf.model.likelihood.noise.cpu().detach().numpy().squeeze()

    hyp = np.log(
        np.hstack((lengthscales, np.sqrt(variance[:, None]),
                   np.sqrt(noise[:, None])))).T

    gpmodel = oct2py.io.Struct()
    gpmodel.hyp = hyp
    gpmodel.inputs = X0
    gpmodel.targets = Y0

    # Call gp2 in Octave
    M_mat, S_mat, V_mat = octave.gp2(gpmodel, m.T, s, nout=3)

    assert M.shape == M_mat.T.shape
    assert S.shape == S_mat.shape
    assert V.shape == V_mat.shape
    np.testing.assert_allclose(M, M_mat.T, rtol=1e-4)
    np.testing.assert_allclose(S, S_mat, rtol=1e-4)
    np.testing.assert_allclose(V, V_mat, rtol=1e-4)
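
For reference, a minimal standalone sketch of the hyperparameter layout handed to the Octave call above, using placeholder values rather than the fitted hyperparameters: each output GP contributes one column of d log lengthscales plus the log signal and noise standard deviations, so gp2 receives a (d + 2) x k matrix.

# Standalone shape sketch (placeholder values, mirroring the test above).
import numpy as np

d, k = 3, 2
lengthscales = np.ones((k, d))          # one row of lengthscales per output GP
variance = np.ones(k)                   # signal variances
noise = 1e-2 * np.ones(k)               # noise variances
hyp = np.log(np.hstack((lengthscales, np.sqrt(variance[:, None]),
                        np.sqrt(noise[:, None])))).T
assert hyp.shape == (d + 2, k)          # one column of hyperparameters per output GP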
def load_and_run_model(env, name):
    controller = RbfController(state_dim=state_dim, control_dim=control_dim, num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name), controller=controller, reward=R, sparse=False)

    print('Running {:s}'.format(name))
    rollout(env, pilco, timesteps=T_sim, verbose=False, SUBS=SUBS)
def loader(name):
    env = gym.make('continuous-cartpole-v99')
    env.seed(73)

    controller = RbfController(state_dim=state_dim, control_dim=control_dim, num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name), controller=controller, reward=R, sparse=False)

    score_logger = ScoreLogger('PI ADJUST ANALYSIS')
    # observation_space = env.observation_space.shape[0]

    run = 0
    while True:
        run += 1
        state = env.reset()
        step = 0
        while True:
            step += 1
            env.render()

            #TODO RUN PI ADJUST
            action = utils.policy(env, pilco, state, False)
            # TODO RUN PI ADJUST COMMENT THE NEXT LINE

            state_next, reward, terminal, info = env.step(action)
            # reward = reward if not terminal else -reward
            state = state_next
            if terminal:
                print("Run: " + str(run) + ", score: " + str(step))
                score_logger.add_score(step, run)
                break

    env.env.close()
Example #4
def plot_pilco_source_learning_curve():
    env = gym.make('continuous-cartpole-v0')
    env.seed(73)

    pilcos = ['initial'] + [str(i) for i in range(6)]

    rewards = []
    for i, p in enumerate(pilcos):
        controller = RbfController(state_dim=state_dim,
                                   control_dim=control_dim,
                                   num_basis_functions=bf,
                                   max_action=max_action)
        R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
        pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(p),
                           controller=controller,
                           reward=R,
                           sparse=False)

        score_logger = ScoreLogger('Score for Model {:d}'.format(i))
        state = env.reset()
        step = 0

        xs = []
        angles = []

        while True:
            xs.append(state[0])
            angles.append(state[2])
            step += 1

            env.render()

            u_action = utils.policy(env, pilco, state, False)
            state_copy = state

            a = np.ndarray.tolist(state_copy)
            a.extend(np.ndarray.tolist(u_action))

            state_next, reward, terminal, info = env.step(u_action)
            reward = reward if not terminal else -reward
            state = state_next

            if terminal:
                print('Run: {:d}, score: {:d}'.format(i, step))
                score_logger.add_score(step, i)
                break

        rewards.append(step)

        plt.plot(xs, angles)
        plt.savefig('pilco-{:d}_states_plot'.format(i), bbox_inches="tight")
        plt.close()

    env.close()

    plt.plot([i for i, _ in enumerate(pilcos)], rewards)
    plt.savefig('pilco_rewards_plot', bbox_inches="tight")
    plt.close()

    return rewards, xs, angles
def piadjust(NT, name):
    controller = RbfController(state_dim=state_dim, control_dim=control_dim, num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name), controller=controller, reward=R, sparse=False)

    env_S = gym.make('continuous-cartpole-v0')
    env_S.seed(73)

    env_T = gym.make('continuous-cartpole-v99')
    env_T.seed(73)

    D_S = sampler(pilco, env_S, samples_n=30, trials=50)
    # print('D_S sampling done')

    D_T = None
    pi_adj = pilco

    for i in range(NT):
        print('{:d}/{:d}'.format(i + 1, NT))

        D_adj = []

        if i == 0:
            D_i_T = sampler(pilco, env_T, samples_n=30)
        else:
            D_i_T = sampler_adj(pi_adj, pilco, env_T, 30)

        if D_T is not None:
            D_T = np.concatenate((D_i_T, D_T))
        else:
            D_T = D_i_T

        # print('Going for inverse dyn')
        gpr = inverse_dyn(D_T)
        # print('inverse dyn done')

        for samp in D_S:
            x_s = list(samp[0])
            x_s1 = list(samp[2])
            u_t_S = samp[1]

            a = np.array(x_s + x_s1).reshape(1, 8)
            u_t_T = gpr.predict(a, return_std=False)

            D_adj.append((x_s, u_t_S, u_t_T - u_t_S))

        # print('Going for L3')
        pi_adj = L3(D_adj)
        # print('L3 Done')

        # i = i + 1
        # if (i % 1 == 0):
        save_object(pi_adj, 'transfer-save/pilco-{:s}-transfer-{:d}.pkl'.format(name, i))

    env_S.env.close()
    env_T.env.close()

    return pi_adj
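
A hypothetical invocation of piadjust (the model name '5' follows the saved-model naming used elsewhere in these examples and is an assumption, not something fixed by the function):

# Hypothetical usage: 10 transfer iterations against the target environment,
# starting from the source policy saved as 'saved/pilco-continuous-cartpole-5'.
pi_adj = piadjust(NT=10, name='5')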
Example #6
def test_rbf():
    np.random.seed(0)
    d = 3  # Input dimension
    k = 2  # Number of outputs
    b = 100  # basis functions

    # Training Dataset
    X0 = np.random.rand(100, d)
    A = np.random.rand(d, k)
    Y0 = np.sin(X0).dot(A) + 1e-3 * (np.random.rand(100, k) - 0.5)  # Just something smooth
    rbf = RbfController(d, k, b)
    rbf.set_XY(X0, Y0)

    # Generate input
    m = np.random.rand(1, d)  # But MATLAB defines it as m'
    s = np.random.rand(d, d)
    s = s.dot(s.T)  # Make s positive semidefinite

    M, S, V = compute_action_wrapper(rbf, m, s)

    # convert data to the struct expected by the MATLAB implementation
    lengthscales = np.stack(
        [model.kern.lengthscales.value for model in rbf.models])
    variance = np.stack([model.kern.variance.value for model in rbf.models])
    noise = np.stack([model.likelihood.variance.value for model in rbf.models])

    hyp = np.log(
        np.hstack((lengthscales, np.sqrt(variance[:, None]),
                   np.sqrt(noise[:, None])))).T

    gpmodel = oct2py.io.Struct()
    gpmodel.hyp = hyp
    gpmodel.inputs = X0
    gpmodel.targets = Y0

    # Call gp2 in Octave
    M_mat, S_mat, V_mat = octave.gp2(gpmodel, m.T, s, nout=3)

    assert M.shape == M_mat.T.shape
    assert S.shape == S_mat.shape
    assert V.shape == V_mat.shape
    np.testing.assert_allclose(M, M_mat.T, rtol=1e-4)
    np.testing.assert_allclose(S, S_mat, rtol=1e-4)
    np.testing.assert_allclose(V, V_mat, rtol=1e-4)
Example #7
def see_progression(pilco_name='saved/pilco-continuous-cartpole-5',
                    transfer_name='{:d}true_dyn_pi_adj.pkl',
                    adjust=True):
    env = gym.make('continuous-cartpole-v99')
    env.seed(1)
    controller = RbfController(state_dim=state_dim,
                               control_dim=control_dim,
                               num_basis_functions=bf,
                               max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco(pilco_name,
                       controller=controller,
                       reward=R,
                       sparse=False)

    rewards = []

    for i in range(10):
        print('Running {:s}'.format(transfer_name.format(i)))
        if adjust:
            with open(transfer_name.format(i), 'rb') as inp2:
                pi_adjust = pickle.load(inp2)

        score_logger = ScoreLogger('Score for Model {:d}'.format(i))
        state = env.reset()
        step = 0
        while True:
            step += 1

            env.render()

            u_action = utils.policy(env, pilco, state, False)
            state_copy = state

            a = np.ndarray.tolist(state_copy)
            a.extend(np.ndarray.tolist(u_action))

            if adjust:
                pi_adjust_action = pi_adjust.predict(
                    np.array(a).reshape(1, -1))[0]
            else:
                pi_adjust_action = 0  # ENABLE THIS TO SEE IT RUN WITHOUT THE ADJUSTMENT

            state_next, reward, terminal, info = env.step(u_action +
                                                          pi_adjust_action)
            reward = reward if not terminal else -reward
            state = state_next

            if terminal:
                print('Run: {:d}, score: {:d}'.format(i, step))
                score_logger.add_score(step, i)
                break

        rewards.append(step)

    env.close()
    return rewards
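
A brief usage sketch (paths come from the defaults in the signature above; adjust=False replays the source policy without the learned correction):

# Hypothetical usage: compare the adjusted and unadjusted policies on the target env.
rewards_adjusted = see_progression(adjust=True)
rewards_plain = see_progression(adjust=False)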
Example #8
def run_transfer_model_and_plot_pos(env_name, pilco_name, fig_file_name, fig_title, transfer_name=None, save=True):
    env = gym.make(env_name)
    env.seed(1)
    controller = RbfController(state_dim=state_dim, control_dim=control_dim, num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(pilco_name), controller=controller, reward=R, sparse=False)

    if transfer_name is not None:
        with open(transfer_name, 'rb') as inp2:
            pi_adjust = pickle.load(inp2)

    xs = []
    angles = []

    state = env.reset()
    for _ in range(1000):
        xs.append(state[0])
        angles.append(state[2])
        env.render()

        u_action = policy(env, pilco, state, False)
        state_copy = state

        a = np.ndarray.tolist(state_copy)
        a.extend(np.ndarray.tolist(u_action))

        if transfer_name is not None:
            pi_adjust_action = pi_adjust.predict(np.array(a).reshape(1, -1))[0]
        else:
            pi_adjust_action = 0

        state_next, reward, terminal, info = env.step(u_action + pi_adjust_action)
        state = state_next

        if terminal:
            break

    env.close()

    xs = np.array(xs)
    angles = np.array(angles)

    plt.plot(xs, angles)
    plt.quiver(xs[:-1], angles[:-1], xs[1:] - xs[:-1], angles[1:] - angles[:-1], scale_units='xy', angles='xy', scale=1, color='blue', width=1e-2)
    plt.xlabel('position')
    plt.ylabel('angle')
    plt.title(fig_title)
    plt.xlim(-0.2, 0.2)
    plt.ylim(-0.2, 0.2)

    if save:
        plt.savefig(fig_file_name, bbox_inches="tight")
        plt.close()
Example #9
def souce_loader(name):
    Rs = np.empty(10).reshape(1, 10)
    env = gym.make('continuous-cartpole-v99')
    env.seed(73)
    controller = RbfController(state_dim=state_dim,
                               control_dim=control_dim,
                               num_basis_functions=bf,
                               max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller,
                       reward=R,
                       sparse=False)

    env = gym.make('continuous-cartpole-v99')

    pi_adjust = None

    score_logger = ScoreLogger('PI ADJUST ANALYSIS')
    run = 0
    avg_reward = 0
    while run != 101:
        run += 1
        if (run % 20 == 0):
            print('run:  ', run)
        state = env.reset()
        # print(state)
        # input()
        step = 0
        while True:
            step += 1
            # env.render()

            # TODO RUN PI ADJUST
            u_action = utils.policy(env, pilco, state, False)
            state_copy = state

            # TODO RUN PI ADJUST COMMENT THE NEXT LINE

            state_next, reward, terminal, info = env.step(u_action)
            reward = reward if not terminal else -reward
            state = state_next
            if terminal:
                # print("Run: "  + ", score: " + str(step))
                score_logger.add_score(step, run)
                avg_reward = avg_reward + step
                break
    avg_reward = avg_reward / run
    env.env.close()
    return (avg_reward)
Example #10
def true_loader(name):
    env = gym.make('continuous-cartpole-v99')
    env.seed(73)
    controller = RbfController(state_dim=state_dim,
                               control_dim=control_dim,
                               num_basis_functions=bf,
                               max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller,
                       reward=R,
                       sparse=False)

    with open('9true_dyn_pi_adj.pkl', 'rb') as inp2:
        pi_adjust = pickle.load(inp2)

    # with open('10_pi_adj.pkl', 'rb') as inp2:
    #     good_pi = pickle.load(inp2)

    score_logger = ScoreLogger('PI ADJUST ANALYSIS')
    run = 0
    while True:
        run += 1
        state = env.reset()
        # print(state)
        # input()
        step = 0
        while True:
            step += 1
            env.render()

            u_action = utils.policy(env, pilco, state, False)
            state_copy = state

            a = np.ndarray.tolist(state_copy)
            a.extend(np.ndarray.tolist(u_action))
            action = pi_adjust.predict(np.array(a).reshape(1, -1))[0]

            state_next, reward, terminal, info = env.step(action + u_action)
            reward = reward if not terminal else -reward
            state = state_next

            if terminal:
                print("Run: " + ", score: " + str(step))
                score_logger.add_score(step, run)
                break

    env.env.close()
Example #11
    def get_RewardFun_and_Controller(self, state_dim, control_dim, data_mean,
                                     data_std, max_yerror, W):
        # t1 = np.divide([max_yerror,0.0,0.0,0.0] - data_mean,data_std)
        # t2 = np.divide([-max_yerror,0.0,0.0,0.0] - data_mean,data_std)
        # R1 = ExponentialReward(state_dim, W, t=t1)
        # R2 = ExponentialReward(state_dim, W, t=t2)
        target = np.divide([0.0, 0.0, 0.0, 0.0] - data_mean, data_std)
        Re = ExponentialReward(state_dim=state_dim, t=target, W=W)
        # R = CombinedRewards(state_dim, [Re,R1,R2], coefs=[2.0,-1.0,-1.0])

        controller = RbfController(state_dim=state_dim,
                                   control_dim=control_dim,
                                   num_basis_functions=50)
        # num_basis_functions: number of RBF functions; a larger number is more flexible but increases computational demands.
        # If v==4: num_basis_functions=50
        return Re, controller
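
A minimal standalone sketch of the target normalisation used above (placeholder statistics; in practice data_mean and data_std would come from the collected rollout data):

# Standalone sketch: map the raw target state [0, 0, 0, 0] into normalised coordinates.
import numpy as np

data_mean = np.array([0.1, 0.0, 0.05, 0.0])   # assumed state means
data_std = np.array([0.5, 1.0, 0.2, 1.0])     # assumed state stds
target = np.divide([0.0, 0.0, 0.0, 0.0] - data_mean, data_std)
print(target)   # the zero state expressed in the normalised state space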
Example #12

if __name__ == '__main__':
    bf = 10
    max_action = 1.0
    target = np.array([0., 0., 0., 0.])
    weights = np.diag([0.5, 0.1, 0.5, 0.25])
    T = 400

    state_dim = 4
    control_dim = 1

    with tf.Session() as sess:
        env = gym.make('continuous-cartpole-v0')

        controller = RbfController(state_dim=state_dim,
                                   control_dim=control_dim,
                                   num_basis_functions=bf,
                                   max_action=max_action)
        R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
        pilco = load_pilco('saved/pilco-continuous-cartpole-5',
                           controller=controller,
                           reward=R,
                           sparse=False)

        pilco.act = lambda x: pilco.compute_action(x[None, :])[0, :]

        env_s = gym.make('continuous-cartpole-v0')
        env_t = gym.make('continuous-cartpole-v1')
        padjust(env_s, env_t, pilco)
Example #13
    # Initial random rollouts to generate a dataset
    X, Y, _ = rollout(env=env, pilco=None, random=True, timesteps=40)
    random_rewards = []
    for i in range(1, 3):
        X_, Y_, rewards = rollout(env=env,
                                  pilco=None,
                                  random=True,
                                  timesteps=40)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))
        random_rewards.append(sum(rewards))

    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim
    controller = RbfController(state_dim=state_dim,
                               control_dim=control_dim,
                               num_basis_functions=5)
    # controller = LinearController(state_dim=state_dim, control_dim=control_dim)

    pilco = PILCO(X, Y, controller=controller, horizon=40)
    # Example of user provided reward function, setting a custom target state
    # R = ExponentialReward(state_dim=state_dim, t=np.array([0.1,0,0,0]))
    # pilco = PILCO(X, Y, controller=controller, horizon=40, reward=R)

    # Example of fixing a parameter, optional, for a linear controller only
    # pilco.controller.b = np.array([[0.0]])
    # pilco.controller.b.trainable = False

    for rollouts in range(3):
        pilco.optimize_models()
        pilco.optimize_policy()
Example #14
def safe_swimmer_run(seed=0, logging=False):
    env = SwimmerWrapper()
    state_dim = 9
    control_dim = 2
    SUBS = 2
    maxiter = 60
    max_action = 1.0
    m_init = np.reshape(np.zeros(state_dim),
                        (1, state_dim))  # initial state mean
    S_init = 0.05 * np.eye(state_dim)
    J = 1
    N = 12
    T = 25
    bf = 30
    T_sim = 100

    # Reward function that discourages the joints from hitting their max angles
    weights_l = np.zeros(state_dim)
    weights_l[0] = 0.5
    max_ang = (100 / 180 * np.pi) * 0.95
    R1 = LinearReward(state_dim, weights_l)

    C1 = SingleConstraint(1, low=-max_ang, high=max_ang, inside=False)
    C2 = SingleConstraint(2, low=-max_ang, high=max_ang, inside=False)
    C3 = SingleConstraint(3, low=-max_ang, high=max_ang, inside=False)
    R = CombinedRewards(state_dim, [R1, C1, C2, C3],
                        coefs=[1.0, -10.0, -10.0, -10.0])

    th = 0.2
    # Initial random rollouts to generate a dataset
    X, Y, _, _ = rollout(env,
                         None,
                         timesteps=T,
                         random=True,
                         SUBS=SUBS,
                         verbose=True)
    for i in range(1, J):
        X_, Y_, _, _ = rollout(env,
                               None,
                               timesteps=T,
                               random=True,
                               SUBS=SUBS,
                               verbose=True)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))

    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim
    controller = RbfController(state_dim=state_dim,
                               control_dim=control_dim,
                               num_basis_functions=bf,
                               max_action=max_action)

    pilco = PILCO((X, Y),
                  controller=controller,
                  horizon=T,
                  reward=R,
                  m_init=m_init,
                  S_init=S_init)
    for model in pilco.mgpr.models:
        model.likelihood.variance.assign(0.001)
        set_trainable(model.likelihood.variance, False)

    new_data = True
    eval_runs = T_sim
    evaluation_returns_full = np.zeros((N, eval_runs))
    evaluation_returns_sampled = np.zeros((N, eval_runs))
    X_eval = []
    for rollouts in range(N):
        print("**** ITERATION no", rollouts, " ****")
        if new_data:
            pilco.optimize_models(maxiter=100)
            new_data = False
        pilco.optimize_policy(maxiter=1, restarts=2)

        m_p = np.zeros((T, state_dim))
        S_p = np.zeros((T, state_dim, state_dim))
        predicted_risk1 = np.zeros(T)
        predicted_risk2 = np.zeros(T)
        predicted_risk3 = np.zeros(T)
        for h in range(T):
            m_h, S_h, _ = pilco.predict(m_init, S_init, h)
            m_p[h, :], S_p[h, :, :] = m_h[:], S_h[:, :]
            predicted_risk1[h], _ = C1.compute_reward(m_h, S_h)
            predicted_risk2[h], _ = C2.compute_reward(m_h, S_h)
            predicted_risk3[h], _ = C3.compute_reward(m_h, S_h)
        estimate_risk1 = 1 - np.prod(1.0 - predicted_risk1)
        estimate_risk2 = 1 - np.prod(1.0 - predicted_risk2)
        estimate_risk3 = 1 - np.prod(1.0 - predicted_risk3)
        overall_risk = 1 - (1 - estimate_risk1) * (1 - estimate_risk2) * (
            1 - estimate_risk3)
        if overall_risk < th:
            X_new, Y_new, _, _ = rollout(env,
                                         pilco,
                                         timesteps=T_sim,
                                         verbose=True,
                                         SUBS=SUBS)
            new_data = True
            # Update dataset
            X = np.vstack((X, X_new[:T, :]))
            Y = np.vstack((Y, Y_new[:T, :]))
            pilco.mgpr.set_data((X, Y))
            if estimate_risk1 < th / 10:
                R.coefs.assign(R.coefs.value() * [1.0, 0.75, 1.0, 1.0])
            if estimate_risk2 < th / 10:
                R.coefs.assign(R.coefs.value() * [1.0, 1.0, 0.75, 1.0])
            if estimate_risk3 < th / 10:
                R.coefs.assign(R.coefs.value() * [1.0, 1.0, 1.0, 0.75])
        else:
            print("*********CHANGING***********")
            if estimate_risk1 > th / 3:
                R.coefs.assign(R.coefs.value() * [1.0, 1.5, 1.0, 1.0])
            if estimate_risk2 > th / 3:
                R.coefs.assign(R.coefs.value() * [1.0, 1.0, 1.5, 1.0])
            if estimate_risk3 > th / 3:
                R.coefs.assign(R.coefs.value() * [1.0, 1.0, 1.0, 1.5])
            _, _, r = pilco.predict(m_init, S_init, T)
def safe_cars(name='', seed=0, logging=False):
    T = 25
    th = 0.05
    np.random.seed(seed)
    J = 5
    N = 8
    eval_runs = 5
    env = LinearCars()
    # Initial random rollouts to generate a dataset
    X1, Y1, _, _ = rollout(env, pilco=None, timesteps=T, verbose=True, random=True, render=False)
    for i in range(1,5):
        X1_, Y1_, _, _ = rollout(env, pilco=None, timesteps=T, verbose=True, random=True, render=False)
        X1 = np.vstack((X1, X1_))
        Y1 = np.vstack((Y1, Y1_))

    env = Normalised_Env(env, np.mean(X1[:,:4],0), np.std(X1[:,:4], 0), gym_env=False)
    X, Y, _, _ = rollout(env, pilco=None, timesteps=T, verbose=True, random=True, render=False)
    for i in range(1,J):
        X_, Y_, _, _ = rollout(env, pilco=None, timesteps=T, verbose=True, random=True, render=False)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))

    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim

    m_init = np.transpose(X[0,:-1,None])
    S_init = 0.1 * np.eye(state_dim)

    controller = RbfController(state_dim=state_dim, control_dim=control_dim,
                               num_basis_functions=40, max_action=0.2)

    #w1 = np.diag([1.5, 0.001, 0.001, 0.001])
    #t1 = np.divide(np.array([3.0, 1.0, 3.0, 1.0]) - env.m, env.std)
    #R1 = ExponentialReward(state_dim=state_dim, t=t1, W=w1)
    # R1 = LinearReward(state_dim=state_dim, W=np.array([0.1, 0.0, 0.0, 0.0]))
    R1 = LinearReward(state_dim=state_dim, W=np.array([1.0 * env.std[0], 0., 0., 0,]))

    # transforming the bounds [-1,1] to the new space after normalisation
    bound_x1 = 1 / env.std[0]
    bound_x2 = 1 / env.std[2]
    B = RiskOfCollision(2, [-bound_x1-env.m[0]/env.std[0], -bound_x2 - env.m[2]/env.std[2]],
                           [bound_x1 - env.m[0]/env.std[0], bound_x2 - env.m[2]/env.std[2]])

    pilco = SafePILCO((X, Y), controller=controller, mu=-300.0, reward_add=R1, reward_mult=B, horizon=T,
                      m_init=m_init, S_init=S_init)

    # define tolerance
    new_data = True
    # init = tf.global_variables_initializer()
    evaluation_returns_full = np.zeros((N, eval_runs))
    evaluation_returns_sampled = np.zeros((N, eval_runs))
    X_eval = []
    for rollouts in range(N):
        print("***ITERATION**** ", rollouts)
        if new_data:
            pilco.optimize_models(maxiter=100, restarts=2)
            new_data = False
        pilco.optimize_policy(maxiter=20, restarts=5)
        # check safety
        m_p = np.zeros((T, state_dim))
        S_p = np.zeros((T, state_dim, state_dim))
        predicted_risks = np.zeros(T)
        predicted_rewards = np.zeros(T)

        for h in range(T):
            m_h, S_h, _ = pilco.predict(m_init, S_init, h)
            m_p[h,:], S_p[h,:,:] = m_h[:], S_h[:,:]
            predicted_risks[h], _ = B.compute_reward(m_h, S_h)
            predicted_rewards[h], _ = R1.compute_reward(m_h, S_h)
        overall_risk = 1 - np.prod(1.0-predicted_risks)

        print("Predicted episode's return: ", sum(predicted_rewards))
        print("Overall risk ", overall_risk)
        print("Mu is ", pilco.mu.numpy())
        print("bound1 ", bound_x1, " bound1 ", bound_x2)
        #print("No of ops:", len(tf.get_default_graph().get_operations()))

        if overall_risk < th:
            X_new, Y_new, _, _ = rollout(env, pilco=pilco, timesteps=T, verbose=True, render=False)
            new_data = True
            X = np.vstack((X, X_new)); Y = np.vstack((Y, Y_new))
            pilco.mgpr.set_data((X, Y))
            if overall_risk < (th/4):
                pilco.mu.assign(0.75 * pilco.mu.numpy())
            if logging:
                for k in range(0, eval_runs):
                    [X_eval_, _,
                    evaluation_returns_sampled[rollouts, k],
                    evaluation_returns_full[rollouts, k]] = rollout(env, pilco,
                                                                   timesteps=T,
                                                                   verbose=False, SUBS=1,
                                                                   render=False)
                    if len(X_eval)==0:
                        X_eval = X_eval_.copy()
                    else:
                        X_eval = np.vstack((X_eval, X_eval_))
            np.savetxt("results/" + "X_"+ str(seed) + ".csv", X, delimiter=',')
            np.savetxt("results/" + "X_eval_" + str(seed) + ".csv", X_eval, delimiter=',')
            np.savetxt("results/" + "evaluation_returns_sampled_" + str(seed) + ".csv",
                        evaluation_returns_sampled, delimiter=',')
            np.savetxt("results/" + "evaluation_returns_full_" + str(seed) + ".csv",
                        evaluation_returns_full, delimiter=',')
        else:
            X_new, Y_new,_,_ = rollout(env, pilco=pilco, timesteps=T, verbose=True, render=False)
            print(m_p[:,0] - X_new[:,0])
            print(m_p[:,2] - X_new[:,2])
            print("*********CHANGING***********")
            _, _, r = pilco.predict(m_init, S_init, T)
            print(r)
            # to verify this actually changes, run the reward wrapper before and after on the same trajectory
            pilco.mu.assign(1.5 * pilco.mu.numpy())
            _, _, r = pilco.predict(m_init, S_init, T)
            print(r)
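
For context, the per-step collision risks predicted above are folded into an episode-level risk under an independence assumption; a minimal standalone numeric sketch of that formula:

# Standalone sketch of the risk aggregation used above (placeholder probabilities).
import numpy as np

predicted_risks = np.array([0.01, 0.02, 0.005])    # assumed per-step violation probabilities
overall_risk = 1 - np.prod(1.0 - predicted_risks)  # P(at least one violation), steps treated as independent
print(overall_risk)                                # ~0.0347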
Example #16
def loader(name):
    Rs = np.empty(10).reshape(1, 10)
    env = gym.make('continuous-cartpole-v99')
    env.seed(73)
    controller = RbfController(state_dim=state_dim,
                               control_dim=control_dim,
                               num_basis_functions=bf,
                               max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller,
                       reward=R,
                       sparse=False)

    for pick in range(1, 11):
        env = gym.make('continuous-cartpole-v99')

        with open(str(pick) + '_pi_adj.pkl', 'rb') as inp2:
            pi_adjust = pickle.load(inp2)

        score_logger = ScoreLogger('PI ADJUST ANALYSIS')
        run = 0
        avg_reward = 0
        while run != 101:
            run += 1
            if (run % 20 == 0):
                print('run:  ', run)
            state = env.reset()
            # print(state)
            # input()
            step = 0
            while True:
                step += 1
                #env.render()

                #TODO RUN PI ADJUST
                u_action = utils.policy(env, pilco, state, False)
                state_copy = state

                a = np.ndarray.tolist(state_copy)
                a.extend(np.ndarray.tolist(u_action))
                action = pi_adjust.predict(np.array(a).reshape(1, -1))
                action = action[0]
                if action[0] > 1:
                    action[0] = 1
                elif action[0] < -1:
                    action[0] = -1
                # TODO RUN PI ADJUST COMMENT THE NEXT LINE

                state_next, reward, terminal, info = env.step(action)
                reward = reward if not terminal else -reward
                state = state_next
                if terminal:
                    # print("Run: "  + ", score: " + str(step))
                    score_logger.add_score(step, run)
                    avg_reward = avg_reward + step
                    break
        avg_reward = avg_reward / run
        env.env.close()
        Rs[0][pick - 1] = avg_reward
    return (Rs)
Example #17
def safe_cars(seed=0):
    T = 25
    th = 0.10
    np.random.seed(seed)
    J = 5
    N = 5
    eval_runs = 5
    env = LinearCars()
    # Initial random rollouts to generate a dataset
    X1, Y1, _, _ = rollout(env,
                           pilco=None,
                           timesteps=T,
                           verbose=True,
                           random=True,
                           render=False)
    for i in range(1, 5):
        X1_, Y1_, _, _ = rollout(env,
                                 pilco=None,
                                 timesteps=T,
                                 verbose=True,
                                 random=True,
                                 render=False)
        X1 = np.vstack((X1, X1_))
        Y1 = np.vstack((Y1, Y1_))

    env = Normalised_Env(np.mean(X1[:, :4], 0), np.std(X1[:, :4], 0))
    X, Y, _, _ = rollout(env,
                         pilco=None,
                         timesteps=T,
                         verbose=True,
                         random=True,
                         render=False)
    for i in range(1, J):
        X_, Y_, _, _ = rollout(env,
                               pilco=None,
                               timesteps=T,
                               verbose=True,
                               random=True,
                               render=False)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))

    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim

    m_init = np.transpose(X[0, :-1, None])
    S_init = 0.1 * np.eye(state_dim)

    controller = RbfController(state_dim=state_dim,
                               control_dim=control_dim,
                               num_basis_functions=40,
                               max_action=0.2)

    #w1 = np.diag([1.5, 0.001, 0.001, 0.001])
    #t1 = np.divide(np.array([3.0, 1.0, 3.0, 1.0]) - env.m, env.std)
    #R1 = ExponentialReward(state_dim=state_dim, t=t1, W=w1)
    # R1 = LinearReward(state_dim=state_dim, W=np.array([0.1, 0.0, 0.0, 0.0]))
    R1 = LinearReward(state_dim=state_dim,
                      W=np.array([
                          1.0 * env.std[0],
                          0.,
                          0.,
                          0,
                      ]))

    bound_x1 = 1 / env.std[0]
    bound_x2 = 1 / env.std[2]
    B = RiskOfCollision(
        2,
        [-bound_x1 - env.m[0] / env.std[0], -bound_x2 - env.m[2] / env.std[2]],
        [bound_x1 - env.m[0] / env.std[0], bound_x2 - env.m[2] / env.std[2]])

    pilco = SafePILCO((X, Y),
                      controller=controller,
                      mu=-300.0,
                      reward_add=R1,
                      reward_mult=B,
                      horizon=T,
                      m_init=m_init,
                      S_init=S_init)

    for model in pilco.mgpr.models:
        model.likelihood.variance.assign(0.001)
        set_trainable(model.likelihood.variance, False)

    # define tolerance
    new_data = True
    # init = tf.global_variables_initializer()
    evaluation_returns_full = np.zeros((N, eval_runs))
    evaluation_returns_sampled = np.zeros((N, eval_runs))
    X_eval = []
    for rollouts in range(N):
        print("***ITERATION**** ", rollouts)
        if new_data:
            pilco.optimize_models(maxiter=100)
            new_data = False
        pilco.optimize_policy(maxiter=20, restarts=2)
        # check safety
        m_p = np.zeros((T, state_dim))
        S_p = np.zeros((T, state_dim, state_dim))
        predicted_risks = np.zeros(T)
        predicted_rewards = np.zeros(T)

        for h in range(T):
            m_h, S_h, _ = pilco.predict(m_init, S_init, h)
            m_p[h, :], S_p[h, :, :] = m_h[:], S_h[:, :]
            predicted_risks[h], _ = B.compute_reward(m_h, S_h)
            predicted_rewards[h], _ = R1.compute_reward(m_h, S_h)
        overall_risk = 1 - np.prod(1.0 - predicted_risks)

        print("Predicted episode's return: ", sum(predicted_rewards))
        print("Overall risk ", overall_risk)
        print("Mu is ", pilco.mu.numpy())
        print("bound1 ", bound_x1, " bound1 ", bound_x2)

        if overall_risk < th:
            X_new, Y_new, _, _ = rollout(env,
                                         pilco=pilco,
                                         timesteps=T,
                                         verbose=True,
                                         render=False)
            new_data = True
            X = np.vstack((X, X_new))
            Y = np.vstack((Y, Y_new))
            pilco.mgpr.set_data((X, Y))
            if overall_risk < (th / 4):
                pilco.mu.assign(0.75 * pilco.mu.numpy())

        else:
            X_new, Y_new, _, _ = rollout(env,
                                         pilco=pilco,
                                         timesteps=T,
                                         verbose=True,
                                         render=False)
            print(m_p[:, 0] - X_new[:, 0])
            print(m_p[:, 2] - X_new[:, 2])
            print("*********CHANGING***********")
            _, _, r = pilco.predict(m_init, S_init, T)
            print(r)
            # to verify this actually changes, run the reward wrapper before and after on the same trajectory
            pilco.mu.assign(1.5 * pilco.mu.numpy())
            _, _, r = pilco.predict(m_init, S_init, T)
            print(r)
Example #18
def swimmer_run(name, seed):
    env = gym.make('Swimmer-v2').env
    #env = SwimmerWrapper()
    state_dim = 8
    control_dim = 2
    SUBS = 5
    maxiter = 120
    max_action = 1.0
    m_init = np.reshape(np.zeros(state_dim),
                        (1, state_dim))  # initial state mean
    S_init = 0.005 * np.eye(state_dim)
    J = 10
    N = 15
    T = 15
    bf = 40
    T_sim = 50

    # Reward function that discourages the joints from hitting their max angles
    weights_l = np.zeros(state_dim)
    weights_l[3] = 1.0
    max_ang = 95 / 180 * np.pi
    t1 = np.zeros(state_dim)
    t1[2] = max_ang
    w1 = 1e-6 * np.eye(state_dim)
    w1[2, 2] = 10
    t2 = np.zeros(state_dim)
    t2[1] = max_ang
    w2 = 1e-6 * np.eye(state_dim)
    w2[1, 1] = 10
    t5 = np.zeros(state_dim)
    #t5[0] = max_ang
    #w3 = 1e-6 * np.eye(state_dim)
    #w3[0,0] = 5
    t3 = np.zeros(state_dim)
    t3[2] = -max_ang
    t4 = np.zeros(state_dim)
    t4[1] = -max_ang
    #t6 = np.zeros(state_dim); t6[0] = -max_ang
    R2 = LinearReward(state_dim, weights_l)
    R3 = ExponentialReward(state_dim, W=w1, t=t1)
    R4 = ExponentialReward(state_dim, W=w2, t=t2)
    R5 = ExponentialReward(state_dim, W=w1, t=t3)
    R6 = ExponentialReward(state_dim, W=w2, t=t4)
    #R7 = ExponentialReward(state_dim, W=w3, t=t5)
    #R8 = ExponentialReward(state_dim, W=w3, t=t6)
    Rew = CombinedRewards(state_dim, [R2, R3, R4, R5, R6],
                          coefs=[1.0, -1.0, -1.0, -1.0, -1.0])
    # Rew = R2
    # Initial random rollouts to generate a dataset
    X, Y, _, _ = rollout(env,
                         None,
                         timesteps=T,
                         random=True,
                         SUBS=SUBS,
                         verbose=True,
                         render=False)
    for i in range(1, J):
        X_, Y_, _, _ = rollout(env,
                               None,
                               timesteps=T,
                               random=True,
                               SUBS=SUBS,
                               verbose=True,
                               render=False)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))

    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim
    controller = RbfController(state_dim=state_dim,
                               control_dim=control_dim,
                               num_basis_functions=bf,
                               max_action=max_action)
    # controller = LinearController(state_dim=state_dim, control_dim=control_dim, max_action=max_action)

    pilco = PILCO((X, Y),
                  controller=controller,
                  horizon=T,
                  reward=Rew,
                  m_init=m_init,
                  S_init=S_init)
    #for model in pilco.mgpr.models:
    #    model.likelihood.variance = 0.0001
    #    model.likelihood.variance.trainable = False

    logging = True  # To save results in .csv files turn this flag to True
    eval_runs = 10
    evaluation_returns_full = np.zeros((N, eval_runs))
    evaluation_returns_sampled = np.zeros((N, eval_runs))
    eval_max_timesteps = 1000 // SUBS
    X_eval = False
    for rollouts in range(N):
        print("**** ITERATION no", rollouts, " ****")
        pilco.optimize_models(restarts=2)
        pilco.optimize_policy(maxiter=maxiter, restarts=2)

        X_new, Y_new, _, _ = rollout(env,
                                     pilco,
                                     timesteps=T_sim,
                                     verbose=True,
                                     SUBS=SUBS,
                                     render=False)

        cur_rew = 0
        for t in range(0, len(X_new)):
            cur_rew += Rew.compute_reward(
                X_new[t, 0:state_dim, None].transpose(),
                0.0001 * np.eye(state_dim))[0]
            if t == T:
                print(
                    'On this episode, on the planning horizon, PILCO reward was: ',
                    cur_rew)
        print('On this episode PILCO reward was ', cur_rew)

        gym_steps = 1000
        T_eval = gym_steps // SUBS
        # Update dataset
        X = np.vstack((X, X_new[:T, :]))
        Y = np.vstack((Y, Y_new[:T, :]))
        pilco.mgpr.set_data((X, Y))
        if logging:
            if eval_max_timesteps is None:
                eval_max_timesteps = T_sim
            for k in range(0, eval_runs):
                [
                    X_eval_, _, evaluation_returns_sampled[rollouts, k],
                    evaluation_returns_full[rollouts, k]
                ] = rollout(env,
                            pilco,
                            timesteps=eval_max_timesteps,
                            verbose=False,
                            SUBS=SUBS,
                            render=False)
                if rollouts == 0:
                    X_eval = X_eval_.copy()
                else:
                    X_eval = np.vstack((X_eval, X_eval_))
            if not os.path.exists(name):
                os.makedirs(name)
            np.savetxt(name + "X_" + str(seed) + ".csv", X, delimiter=',')
            np.savetxt(name + "X_eval_" + str(seed) + ".csv", X_eval,
                       delimiter=',')
            np.savetxt(name + "evaluation_returns_sampled_" + str(seed) + ".csv",
                       evaluation_returns_sampled,
                       delimiter=',')
            np.savetxt(name + "evaluation_returns_full_" + str(seed) + ".csv",
                       evaluation_returns_full,
                       delimiter=',')
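
A hypothetical call sketch for swimmer_run; name is used as a raw path prefix for os.makedirs and np.savetxt, so a trailing separator is assumed here, and seed is only concatenated into the output filenames:

# Hypothetical usage: results written under results/swimmer/ (assumed output directory).
swimmer_run(name='results/swimmer/', seed='0')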
Example #19
        X_, Y_ = rollout(env=env, pilco=None, random=True,  timesteps=80, SUBS=subs, render=False)
        X = np.vstack((X, X_)).astype(np.float64)
        Y = np.vstack((Y, Y_)).astype(np.float64)
    for i in range(1,24): #Gaussian/Normal distribution action sampling
        X_, Y_ = rollout(env=env, pilco=None, random="Normal",  timesteps=80, SUBS=subs, render=False)
        X = np.vstack((X, X_)).astype(np.float64)
        Y = np.vstack((Y, Y_)).astype(np.float64)
    for i in range(1,4): #No action sampling; u := 0
        X_, Y_ = rollout(env=env, pilco=None, random=None,  timesteps=80, SUBS=subs, render=False)
        X = np.vstack((X, X_)).astype(np.float64)
        Y = np.vstack((Y, Y_)).astype(np.float64)        


    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim
    controller = RbfController(state_dim=state_dim, control_dim=control_dim, num_basis_functions=18, max_action=env.action_space.high)
    #controller = LinearController(state_dim=state_dim, control_dim=control_dim)
    #print(X)
    #pilco = PILCO(X, Y, controller=controller, horizon=40, num_induced_points=72)
    
    # Example of user provided reward function, setting a custom target state
    R = ExponentialReward(state_dim=state_dim, t=target)
    pilco = PILCO(X, Y, controller=controller, horizon=20, reward=R, 
                        num_induced_points=72, m_init=m_init)

    # Example of fixing a parameter, optional, for a linear controller only
    #pilco.controller.b = np.array([[0.0]])
    #pilco.controller.b.trainable = False
    plot_model = False # Plot model set to false until iteration 5 is reached
    for rollouts in range(20):
        print("Optimizing models") 
def pilco_run(env,
              N,
              J,
              safe=False,
              name='',
              seed=0,
              cont=None,
              rew=None,
              SUBS=1,
              sim_timesteps=50,
              plan_timesteps=30,
              restarts=1,
              maxiter=100,
              m_init=None,
              S_init=None,
              fixed_noise=None,
              logging=False,
              eval_runs=5,
              eval_max_timesteps=None,
              variable_episode_length=False):
    np.random.seed(seed)
    X, Y, _, _ = rollout(env=env,
                         pilco=None,
                         timesteps=sim_timesteps,
                         random=True,
                         SUBS=SUBS)
    for i in range(1, J):
        X_, Y_, _, _ = rollout(env=env,
                               pilco=None,
                               timesteps=sim_timesteps,
                               random=True,
                               SUBS=SUBS)
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))
    state_dim = Y.shape[1]
    control_dim = X.shape[1] - state_dim

    if cont is None:
        controller = RbfController(state_dim=state_dim,
                                   control_dim=control_dim,
                                   num_basis_functions=5)
    elif cont['type'] == 'rbf':
        controller = RbfController(state_dim=state_dim,
                                   control_dim=control_dim,
                                   num_basis_functions=cont['basis_functions'],
                                   max_action=cont.get('max_action', 1.0))
    elif cont['type'] == 'linear':
        controller = LinearController(state_dim=state_dim,
                                      control_dim=control_dim,
                                      max_action=cont.get('max_action', 1.0))
    else:
        raise ValueError('Invalid Controller')

    if rew is None:
        reward = None
    elif rew['type'] == 'exp':
        reward = ExponentialReward(state_dim=state_dim, t=rew['t'], W=rew['W'])
    else:
        raise ValueError('This function only handles Exponential rewards for now')

    pilco = PILCO((X, Y),
                  controller=controller,
                  reward=reward,
                  horizon=plan_timesteps,
                  m_init=m_init,
                  S_init=S_init)

    if fixed_noise is not None:
        for model in pilco.mgpr.models:
            model.likelihood.variance.assign(fixed_noise)
            set_trainable(model.likelihood.variance, False)

    evaluation_returns_full = np.zeros((N, eval_runs))
    evaluation_returns_sampled = np.zeros((N, eval_runs))
    if name == '':
        from datetime import datetime
        current_time = datetime.now()
        name = current_time.strftime("%d_%m_%Y_%H_%M_%S")
    for rollouts in range(N):
        print("**** ITERATION no", rollouts, " ****")
        pilco.optimize_models()
        pilco.optimize_policy(maxiter=maxiter, restarts=restarts)

        X_new, Y_new, _, _ = rollout(env,
                                     pilco,
                                     timesteps=sim_timesteps,
                                     SUBS=SUBS,
                                     verbose=True)

        cur_rew = 0
        X = np.vstack((X, X_new[:plan_timesteps, :]))
        Y = np.vstack((Y, Y_new[:plan_timesteps, :]))
        pilco.mgpr.set_data((X, Y))
        if logging:
            if eval_max_timesteps is None:
                eval_max_timesteps = sim_timesteps
            for k in range(0, eval_runs):
                [
                    X_eval_, _, evaluation_returns_sampled[rollouts, k],
                    evaluation_returns_full[rollouts, k]
                ] = rollout(env,
                            pilco,
                            timesteps=eval_max_timesteps,
                            verbose=False,
                            SUBS=SUBS,
                            render=False)
                if rollouts == 0 and k == 0:
                    X_eval = X_eval_.copy()
                else:
                    X_eval = np.vstack((X_eval, X_eval_))
            if not os.path.exists("results/" + name):
                os.makedirs("results/" + name)
            np.savetxt("results/" + name + "X_" + str(seed) + ".csv",
                       X,
                       delimiter=',')
            np.savetxt("results/" + name + "X_eval_" + str(seed) + ".csv",
                       X_eval,
                       delimiter=',')
            np.savetxt("results/" + name + "evaluation_returns_sampled_" +
                       str(seed) + ".csv",
                       evaluation_returns_sampled,
                       delimiter=',')
            np.savetxt("results/" + name + "evaluation_returns_full_" +
                       str(seed) + ".csv",
                       evaluation_returns_full,
                       delimiter=',')
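
Finally, a hypothetical invocation sketch for pilco_run; the environment and the cont/rew dictionaries are assumptions, but their keys ('type', 'basis_functions', 'max_action', 't', 'W') follow the branches inside the function:

# Hypothetical usage sketch (the gym environment and reward target are assumptions).
import gym
import numpy as np

env = gym.make('Pendulum-v0')
cont = {'type': 'rbf', 'basis_functions': 30, 'max_action': 2.0}
rew = {'type': 'exp', 't': np.array([1.0, 0.0, 0.0]), 'W': np.diag([2.0, 2.0, 0.3])}
pilco_run(env, N=8, J=3, cont=cont, rew=rew, SUBS=3,
          sim_timesteps=40, plan_timesteps=40, maxiter=50, logging=False)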