Example #1
def run_transfer_model_and_plot_pos(env_name, pilco_name, fig_file_name, fig_title, transfer_name=None, save=True):
    env = gym.make(env_name)
    env.seed(1)
    controller = RbfController(state_dim=state_dim, control_dim=control_dim, num_basis_functions=bf, max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(pilco_name), controller=controller, reward=R, sparse=False)

    if transfer_name is not None:
        with open(transfer_name, 'rb') as inp2:
            pi_adjust = pickle.load(inp2)

    xs = []
    angles = []

    state = env.reset()
    for _ in range(1000):
        xs.append(state[0])
        angles.append(state[2])
        env.render()

        u_action = policy(env, pilco, state, False)

        # Build the (state, action) feature vector for the adjustment model
        a = state.tolist() + u_action.tolist()

        if transfer_name is not None:
            pi_adjust_action = pi_adjust.predict(np.array(a).reshape(1, -1))[0]
        else:
            pi_adjust_action = 0

        state_next, reward, terminal, info = env.step(u_action + pi_adjust_action)
        state = state_next

        if terminal:
            break

    env.close()

    xs = np.array(xs)
    angles = np.array(angles)

    plt.plot(xs, angles)
    plt.quiver(xs[:-1], angles[:-1], xs[1:] - xs[:-1], angles[1:] - angles[:-1], scale_units='xy', angles='xy', scale=1, color='blue', width=1e-2)
    plt.xlabel('position')
    plt.ylabel('angle')
    plt.title(fig_title)
    plt.xlim(-0.2, 0.2)
    plt.ylim(-0.2, 0.2)

    if save:
        plt.savefig(fig_file_name, bbox_inches="tight")
        plt.close()
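Example #1 relies on module-level globals (state_dim, control_dim, bf, max_action, target, weights) and a policy helper defined elsewhere. A minimal sketch of that helper, modeled on the upstream PILCO examples; the exact signature in this codebase is an assumption:

def policy(env, pilco, x, is_random):
    # Hypothetical helper: sample a random action or query the PILCO controller
    if is_random:
        return env.action_space.sample()
    return pilco.compute_action(x[None, :])[0, :]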
Example #2
def load_and_run_model(env, name):
    controller = RbfController(state_dim=state_dim,
                               control_dim=control_dim,
                               num_basis_functions=bf,
                               max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller,
                       reward=R,
                       sparse=False)

    print('Running {:s}'.format(name))
    rollout(env, pilco, timesteps=T_sim, verbose=False, SUBS=SUBS)
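The rollout helper used here is not shown. A rough sketch of what it typically does in PILCO example code, assuming SUBS acts as a frame-skip and trajectories are returned as (state-action, state-delta) pairs for the GP models; the signature is an assumption:

import numpy as np

def rollout(env, pilco, timesteps, verbose=True, SUBS=1):
    X, Y = [], []
    x = env.reset()
    for _ in range(timesteps):
        u = policy(env, pilco, x, False)
        for _ in range(SUBS):              # hold each action for SUBS frames
            x_new, reward, done, _ = env.step(u)
            if done:
                break
        if verbose:
            print('Action:', u, 'State:', x_new)
        X.append(np.hstack((x, u)))        # GP input: state and action
        Y.append(x_new - x)                # GP target: state difference
        x = x_new
        if done:
            break
    return np.stack(X), np.stack(Y)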
Example #3
def loader(name):
    env = gym.make('continuous-cartpole-v99')
    env.seed(73)

    controller = RbfController(state_dim=state_dim,
                               control_dim=control_dim,
                               num_basis_functions=bf,
                               max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller,
                       reward=R,
                       sparse=False)

    score_logger = ScoreLogger('PI ADJUST ANALYSIS')

    run = 0
    while True:
        run += 1
        state = env.reset()
        step = 0
        while True:
            step += 1
            env.render()

            # TODO: apply the pi_adjust correction here instead of the raw
            # PILCO action (see the sketch after this example)
            action = utils.policy(env, pilco, state, False)

            state_next, reward, terminal, info = env.step(action)
            state = state_next
            if terminal:
                print("Run: " + str(run) + ", score: " + str(step))
                score_logger.add_score(step, run)
                break

    env.env.close()
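The TODO presumably mirrors Example #1: add the correction predicted by the adjustment policy to the PILCO action. A sketch of that replacement, assuming a pi_adjust regressor unpickled as in Example #1:

action = utils.policy(env, pilco, state, False)
a = state.tolist() + action.tolist()
action = action + pi_adjust.predict(np.array(a).reshape(1, -1))[0]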
Example #4
def piadjust(NT, name):
    controller = RbfController(state_dim=state_dim,
                               control_dim=control_dim,
                               num_basis_functions=bf,
                               max_action=max_action)
    R = ExponentialReward(state_dim=state_dim, t=target, W=weights)
    pilco = load_pilco('saved/pilco-continuous-cartpole-{:s}'.format(name),
                       controller=controller,
                       reward=R,
                       sparse=False)

    env_S = gym.make('continuous-cartpole-v0')
    env_S.seed(73)

    env_T = gym.make('continuous-cartpole-v99')
    env_T.seed(73)

    # Sample (state, action, next_state) transitions from the source env
    D_S = sampler(pilco, env_S, samples_n=30, trials=50)

    D_T = None
    pi_adj = pilco  # before the first iteration, the adjusted policy is PILCO itself

    for i in range(NT):
        print('{:d}/{:d}'.format(i + 1, NT))

        D_adj = []

        if i == 0:
            # First iteration: sample the target env with the source policy
            D_i_T = sampler(pilco, env_T, samples_n=30)
        else:
            # Later iterations: sample with the current adjustment policy
            D_i_T = sampler_adj(pi_adj, pilco, env_T, 30)

        if D_T is None:
            D_T = D_i_T
        else:
            D_T = np.concatenate((D_i_T, D_T))

        # Fit an inverse-dynamics model on the accumulated target-env data
        gpr = inverse_dyn(D_T)

        for samp in D_S:
            x_s = list(samp[0])    # source state
            u_t_S = samp[1]        # action taken in the source env
            x_s1 = list(samp[2])   # source next state

            # Predict the target-env action that reproduces this transition
            a = np.array(x_s + x_s1).reshape(1, -1)
            u_t_T = gpr.predict(a, return_std=False)

            # Record the state, the source action, and the required correction
            D_adj.append((x_s, u_t_S, u_t_T - u_t_S))

        # Refit the adjustment policy on the corrected actions
        pi_adj = L3(D_adj)

        # Checkpoint the adjustment policy after every iteration
        save_object(
            pi_adj,
            'transfer-save/pilco-{:s}-transfer-{:d}.pkl'.format(name, i))

    env_S.env.close()
    env_T.env.close()

    return pi_adj
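The sampler, inverse_dyn, and L3 helpers are defined elsewhere. Since gpr.predict(a, return_std=False) matches scikit-learn's GaussianProcessRegressor API, inverse_dyn plausibly looks like the sketch below; the kernel choice and the (state, action, next_state) sample layout are assumptions inferred from how D_S is unpacked:

import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel

def inverse_dyn(D):
    # Inputs: current and next state concatenated; target: the applied action
    X = np.array([list(x) + list(x1) for x, u, x1 in D])
    y = np.array([u for x, u, x1 in D])
    gpr = GaussianProcessRegressor(kernel=RBF() + WhiteKernel(), normalize_y=True)
    gpr.fit(X, y)
    return gpr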
Example #5
    X, Y = runner.run(policy=pilco_policy, controller=controller, timesteps=100)
else:
    logger.info("Get real data from random policy")
    X, Y = runner.run()
    for i in range(1, 5):  # collect four more random rollouts
        X_, Y_ = runner.run()
        X = np.vstack((X, X_))
        Y = np.vstack((Y, Y_))

# get env reward
env_reward = get_env_reward(args.env_name)

# load gp
if args.load_gp_model:
    try:
        pilco = load_pilco(model_path, controller, env_reward, debug=args.debug)
        logger.info("loaded gp model successfully")
    except Exception as e:
        raise ValueError("cannot find saved gp models") from e
else:
    pilco = PILCO(X, Y, controller=controller, reward=env_reward, debug=args.debug)

# for numerical stability
# for model in pilco.mgpr.models:
    # model.likelihood.variance = 0.001
    # model.likelihood.variance.trainable = False

X_init = X[:, 0: state_dim]

# def random_policy(obs):
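The snippet stops right after the PILCO object is built. A hypothetical continuation of the usual train/collect loop; the method names follow the upstream nrontsis/PILCO API and may differ in this fork:

for _ in range(5):
    pilco.optimize_models()       # refit the GP dynamics models
    pilco.optimize_policy()       # improve the controller
    X_, Y_ = runner.run(policy=pilco_policy, controller=controller, timesteps=100)
    X = np.vstack((X, X_))
    Y = np.vstack((Y, Y_))
    pilco.mgpr.set_XY(X, Y)       # retrain the dynamics models on all data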