Example #1
def init_gym_pol(hyperparams):
    """Generates a random inital policy for gym experiments."""
    env = gym.make(hyperparams['env'])

    if is_goal_based(env):
        dX = (env.observation_space.spaces['observation'].shape[0] +
              env.observation_space.spaces['desired_goal'].shape[0])
    else:
        dX = env.observation_space.shape[0]
    dU = env.action_space.shape[0]
    T = hyperparams['T']

    low = np.asarray(env.action_space.low)
    high = np.array(env.action_space.high)

    K = np.zeros((T, dU, dX))
    k = np.empty((T, dU))
    PSig = np.empty((T, dU, dU))
    cholPSig = np.empty((T, dU, dU))
    inv_pol_covar = np.empty((T, dU, dU))

    for t in range(T):
        k[t] = (low + high) / 2
        PSig[t] = np.diag(
            np.square(high - low) / 12) * hyperparams['init_var_scale']
        cholPSig[t] = np.linalg.cholesky(PSig[t])
        inv_pol_covar[t] = np.linalg.inv(PSig[t])

    return LinearGaussianPolicy(K, k, PSig, cholPSig, inv_pol_covar)
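In the loop above, k[t] is the midpoint of the action range and PSig[t] is a scaled diagonal covariance whose entries equal the variance of a uniform distribution over [low, high]. A minimal, self-contained check of that identity (the bounds below are made up and not tied to any particular gym environment):

# Standalone sketch: Var[U(a, b)] = (b - a)^2 / 12, the quantity used for PSig above.
import numpy as np

low, high = np.array([-1.0, -2.0]), np.array([1.0, 2.0])
samples = np.random.uniform(low, high, size=(200000, 2))
empirical = samples.var(axis=0)
analytic = np.square(high - low) / 12.0
print(np.allclose(empirical, analytic, rtol=0.05))  # True, up to sampling noise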
Example #2
def init_pd(hyperparams):
    """
    This function initializes the linear-Gaussian controller as a
    proportional-derivative (PD) controller with Gaussian noise. The
    position gains are controlled by the variable pos_gains, velocity
    gains are controlled by pos_gains*vel_gans_mult.
    """
    config = copy.deepcopy(INIT_LG_PD)
    config.update(hyperparams)

    dU, dQ, dX = config['dU'], config['dQ'], config['dX']
    x0, T = config['x0'], config['T']

    # Choose initialization mode.
    Kp = 1.0
    Kv = config['vel_gains_mult']
    if dU < dQ:
        # Act on the first dU joints only; pad the remaining joint
        # positions/velocities (and any extra state dimensions) with zeros
        # so that K has shape (T, dU, dX), matching the branch below.
        K = -config['pos_gains'] * np.tile(
            np.hstack([np.eye(dU) * Kp, np.zeros((dU, dQ-dU)),
                       np.eye(dU) * Kv, np.zeros((dU, dQ-dU)),
                       np.zeros((dU, dX - dQ*2))]),
            [T, 1, 1]
        )
    else:
        K = -config['pos_gains'] * np.tile(
            np.hstack([
                np.eye(dU) * Kp, np.eye(dU) * Kv,
                np.zeros((dU, dX - dU*2))
            ]), [T, 1, 1]
        )
    k = np.tile(-K[0, :, :].dot(x0), [T, 1])
    PSig = config['init_var'] * np.tile(np.eye(dU), [T, 1, 1])
    cholPSig = np.sqrt(config['init_var']) * np.tile(np.eye(dU), [T, 1, 1])
    invPSig = (1.0 / config['init_var']) * np.tile(np.eye(dU), [T, 1, 1])

    return LinearGaussianPolicy(K, k, PSig, cholPSig, invPSig)
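For the dU == dQ branch above, each row block of K is [Kp*I | Kv*I | 0] scaled by -pos_gains, and the bias k = -K x0 makes the mean action zero at the initial state. A small standalone illustration with made-up dimensions (not taken from any INIT_LG_PD config):

import numpy as np

dU, dX = 2, 6          # 2 actuated joints; state = [positions, velocities, extra]
pos_gains, Kp, Kv = 10.0, 1.0, 0.5
x0 = np.array([0.3, -0.1, 0.0, 0.0, 1.0, 2.0])

K_row = -pos_gains * np.hstack([np.eye(dU) * Kp, np.eye(dU) * Kv,
                                np.zeros((dU, dX - dU * 2))])
k_row = -K_row.dot(x0)
print(np.allclose(K_row.dot(x0) + k_row, 0.0))  # True: zero mean action at x0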
Example #3
def init_pol_ctr(hyperparams):
    """Generates an initial policy by loading linear controllers from a file."""
    dU, dX = hyperparams['dU'], hyperparams['dX']
    T = hyperparams['T']
    data = np.load(hyperparams['ctr_file'])

    K = np.empty((T, dU, dX))
    k = np.empty((T, dU))
    prc = np.empty((T, dU, dU))

    K[:-1] = data['K'][0]
    K[-1] = np.zeros((dU, dX))
    k[:-1] = data['k'][0]
    k[-1] = np.zeros((dU))
    prc[:-1] = data['prc'][0]
    prc[-1] = np.eye(dU)

    PSig = np.empty((T, dU, dU))
    cholPSig = np.empty((T, dU, dU))
    inv_pol_covar = np.empty((T, dU, dU))

    for t in range(T):
        PSig[t] = np.linalg.inv(prc[t])
        cholPSig[t] = np.linalg.cholesky(PSig[t])
        inv_pol_covar[t] = np.linalg.inv(PSig[t])

    return LinearGaussianPolicy(K, k, PSig, cholPSig, inv_pol_covar)
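The loader above indexes data['K'][0], data['k'][0], and data['prc'][0], so each array is stored with a leading axis; the exact file layout beyond that is an assumption. A hedged sketch of writing a compatible file for a single time-invariant controller:

import numpy as np

dU, dX = 2, 4
np.savez('ctr_file.npz',
         K=np.zeros((1, dU, dX)),      # gains, broadcast over the first T-1 steps
         k=np.zeros((1, dU)),          # bias
         prc=np.eye(dU)[None, :, :])   # precision (inverse covariance)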
Example #4
def init_superball(hyperparams):
    """
    Initializes a linear-Gaussian policy, either by loading controllers from
    a MATLAB file (when config['init'] is set) or with zero gains and
    isotropic noise of variance 'init_var'.
    """
    config = copy.deepcopy(INIT_LG_PD)
    config.update(hyperparams)

    T, dU, dX = config['T'], config['dU'], config['dX']

    if 'init' in config and config['init']:
        filename = hyperparams.get('file', 'init_controllers.mat')
        from scipy import io
        s = config['init']
        controllers = io.loadmat(filename)
        K, k = controllers['K' + s], controllers['k' + s]
        PSig = controllers['PS' + s]
        cholPSig = controllers['cPS' + s]
        invPSig = controllers['iPS' + s]
        if 'init_with_var' in config and config['init_with_var']:
            identity_matrix = np.stack(
                [np.eye(PSig.shape[1]) for _ in range(PSig.shape[0])])
            PSig = identity_matrix * config['init_with_var']
            cholPSig = identity_matrix * np.sqrt(config['init_with_var'])
            invPSig = identity_matrix / config['init_with_var']

    else:
        K = np.zeros((T, dU, dX))
        k = np.zeros((T, dU))
        PSig = config['init_var'] * np.tile(np.eye(dU), [T, 1, 1])
        cholPSig = np.sqrt(config['init_var']) * np.tile(np.eye(dU), [T, 1, 1])
        invPSig = (1.0 / config['init_var']) * np.tile(np.eye(dU), [T, 1, 1])

    return LinearGaussianPolicy(K, k, PSig, cholPSig, invPSig)
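The keys read by the loader above are 'K' + s, 'k' + s, 'PS' + s, 'cPS' + s, and 'iPS' + s, with s = config['init']. A hedged sketch (the shapes are guesses; only the key names are read off the code) of producing such a file for config['init'] = '0':

import numpy as np
from scipy import io

T, dU, dX = 100, 12, 36   # illustrative dimensions, not from the real setup
io.savemat('init_controllers.mat', {
    'K0': np.zeros((T, dU, dX)), 'k0': np.zeros((T, dU)),
    'PS0': np.tile(np.eye(dU), (T, 1, 1)),     # covariance per step
    'cPS0': np.tile(np.eye(dU), (T, 1, 1)),    # its Cholesky factor
    'iPS0': np.tile(np.eye(dU), (T, 1, 1)),    # its inverse
})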
Example #5
def init_cmaes_controller(hyperparams, agent):
    """
    Initializes a linear-Gaussian controller for the CMA-ES experiments by
    rolling out a CSA baseline policy and using the empirical mean and
    spread of its actions as the initial bias and noise terms.
    """
    config = copy.deepcopy(INIT_LG)
    config.update(hyperparams)

    dX, dU = config['dX'], config['dU']
    T = config['T']
    cur_cond_idx = config['cur_cond_idx']
    history_len = agent.history_len
    fcn = agent.fcns[cur_cond_idx]
    popsize = agent.popsize
    fcn_obj = fcn.get('fcn_obj')
    hpolib = 'hpolib' in fcn
    benchmark = fcn.get('benchmark')
    # Create a new world to avoid mutating the state of the original world.
    world = CMAESWorld(dim=fcn['dim'], init_loc=fcn['init_loc'],
                       init_sigma=fcn['init_sigma'], init_popsize=popsize,
                       history_len=history_len, fcn=fcn_obj, hpolib=hpolib,
                       benchmark=benchmark)

    if config['verbose']:
        print("Finding Initial Linear Gaussian Controller")
    action_mean = []
    action_var = []
    for _ in range(25):
        f_values = []
        cur_policy = CSAPolicy(T=T)

        world.reset_world()
        world.run()
        for t in range(T):
            X_t = agent.get_vectorized_state(world.get_state(), cur_cond_idx)
            es = world.es
            f_vals = world.func_values
            U_t = cur_policy.act(X_t, None, t, np.zeros((dU,)), es, f_vals)
            world.run_next(U_t)
            f_values.append(U_t)
        action_mean.append(f_values)
        action_var.append(f_values)
    mean_actions = np.mean(action_mean, axis=0)
    var_actions = np.std(action_var, axis=0)
    # Replace zero variances, which would make PSig singular.
    np.place(var_actions, var_actions == 0, config["init_var"])
    Kt = np.zeros((dU, dX))  # K matrix for a single time step.
    kt = mean_actions.reshape((T, 1))

    K = np.tile(Kt[None, :, :], (T, 1, 1))  # Controller gains matrix.
    k = kt
    PSig = var_actions.reshape((T, 1, 1))
    cholPSig = np.sqrt(var_actions).reshape((T, 1, 1))
    invPSig = 1. / var_actions.reshape((T, 1, 1))

    return LinearGaussianPolicy(K, k, PSig, cholPSig, invPSig)
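The np.place call above guards against zero per-step variances, which would make PSig singular and its inverse undefined. A standalone illustration with synthetic numbers:

import numpy as np

var_actions = np.array([0.0, 0.2, 0.0, 0.5])
np.place(var_actions, var_actions == 0, 1e-2)   # stand-in for config['init_var']
print(var_actions)  # [0.01 0.2  0.01 0.5 ]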
Example #6
def fit_emp_controller(demo_x, demo_u):
    """ Fit the conditional covariance of u|x of a set of demonstrations. """
    T = np.shape(demo_u)[1]
    dX = np.shape(demo_x)[2]
    dU = np.shape(demo_u)[2]
    N = np.shape(demo_u)[0]

    K = np.zeros((T, dU, dX))
    k = np.zeros((T, dU))
    pol_covar = np.zeros((T, dU, dU))
    chol_pol_covar = np.zeros((T, dU, dU))
    inv_pol_covar = np.zeros((T, dU, dU))
    traj = LinearGaussianPolicy(K, k, pol_covar, chol_pol_covar, inv_pol_covar)

    for t in range(T):
        X = np.hstack((np.squeeze(demo_x[:, t, :]), np.ones((N, 1))))
        U = np.squeeze(demo_u[:, t, :])
        result = np.linalg.pinv(X).dot(U).T
        traj.K[t, :, :] = result[:, 0: dX]
        traj.k[t, :] = result[:, -1]

        sig = np.zeros((dU, dU))
        for n in range(N):
            diff = (demo_u[n, t, :] - (traj.K[t, :, :].dot(demo_x[n, t, :]) + traj.k[t, :])).reshape(dU, -1)
            sig += diff.dot(diff.T)
        traj.pol_covar[t, :, :] = sig / N + 1e-12 * np.eye(dU)

        traj.chol_pol_covar[t, :, :] = np.linalg.cholesky(traj.pol_covar[t, :, :])
        traj.inv_pol_covar[t, :, :] = np.linalg.pinv(traj.pol_covar[t, :, :])
    return traj
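A quick sanity check of the fit (synthetic demonstrations, not from the original data; it assumes fit_emp_controller and the LinearGaussianPolicy class it builds are importable): generate rollouts from a known linear law u = K x + k plus small noise and confirm the per-timestep least-squares fit recovers the gains.

import numpy as np

np.random.seed(0)
N, T, dX, dU = 200, 5, 3, 2
K_true = np.random.randn(dU, dX)
k_true = np.random.randn(dU)

demo_x = np.random.randn(N, T, dX)
demo_u = demo_x.dot(K_true.T) + k_true + 0.01 * np.random.randn(N, T, dU)

traj = fit_emp_controller(demo_x, demo_u)
print(np.allclose(traj.K[0], K_true, atol=0.02),
      np.allclose(traj.k[0], k_true, atol=0.02))   # True True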
Example #7
def traj_distr(self):
    """ Create a trajectory distribution object from policy info. """
    T, dU, dX = self.pol_K.shape
    # Compute inverse policy covariances.
    inv_pol_S = np.empty_like(self.chol_pol_S)
    for t in range(T):
        inv_pol_S[t, :, :] = np.linalg.solve(
            self.chol_pol_S[t, :, :],
            np.linalg.solve(self.chol_pol_S[t, :, :].T, np.eye(dU)))
    return LinearGaussianPolicy(self.pol_K, self.pol_k, self.pol_S,
                                self.chol_pol_S, inv_pol_S)
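The inverse covariance above is never formed with np.linalg.inv; with S = L L^T it comes from two solves against the Cholesky factor. A standalone check of that pattern on a synthetic matrix:

import numpy as np

S = np.array([[2.0, 0.3],
              [0.3, 1.0]])
L = np.linalg.cholesky(S)                      # lower-triangular factor
inv_S = np.linalg.solve(L, np.linalg.solve(L.T, np.eye(2)))
print(np.allclose(inv_S, np.linalg.inv(S)))    # True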
Example #8
def init_pol(hyperparams):
    """Generates an initial policy of constant actions with added noise."""
    dU, dX = hyperparams['dU'], hyperparams['dX']
    T = hyperparams['T']

    K = np.zeros((T, dU, dX))
    k = np.empty((T, dU))
    PSig = np.empty((T, dU, dU))
    cholPSig = np.empty((T, dU, dU))
    inv_pol_covar = np.empty((T, dU, dU))

    for t in range(T):
        k[t] = hyperparams['init_const']
        PSig[t] = hyperparams['init_var']
        cholPSig[t] = np.linalg.cholesky(PSig[t])
        inv_pol_covar[t] = np.linalg.inv(PSig[t])

    return LinearGaussianPolicy(K, k, PSig, cholPSig, inv_pol_covar)
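A hedged usage sketch (values are made up; attribute names follow the LinearGaussianPolicy usage in Example #6). Note that 'init_var' is assigned into each (dU, dU) block and then Cholesky-factorized, so for dU > 1 it should be a full positive-definite covariance matrix rather than a scalar.

import numpy as np

hyperparams = {
    'dU': 2, 'dX': 4, 'T': 10,
    'init_const': np.zeros(2),        # constant mean action
    'init_var': 0.1 * np.eye(2),      # action noise covariance
}
pol = init_pol(hyperparams)
print(pol.k.shape, pol.pol_covar.shape)   # (10, 2) (10, 2, 2)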
Example #9
def init_lqr(hyperparams):
    """
    Return initial gains for a time-varying linear Gaussian controller
    that tries to hold the initial position.
    """
    config = copy.deepcopy(INIT_LG_LQR)
    config.update(hyperparams)

    x0, dX, dU = config['x0'], config['dX'], config['dU']
    dt, T = config['dt'], config['T']

    #TODO: Use packing instead of assuming which indices are the joint
    #      angles.

    # Notation notes:
    # L = loss, Q = q-function (dX+dU dimensional),
    # V = value function (dX dimensional), F = dynamics
    # Vectors are lower-case, matrices are upper case.
    # Derivatives: x = state, u = action, t = state+action (trajectory).
    # The time index is denoted by _t after the above.
    # Ex. Ltt_t = Loss, 2nd derivative (w.r.t. trajectory),
    # indexed by time t.

    # Constants.
    idx_x = slice(dX)  # Slices out state.
    idx_u = slice(dX, dX+dU)  # Slices out actions.

    if len(config['init_acc']) == 0:
        config['init_acc'] = np.zeros(dU)

    if len(config['init_gains']) == 0:
        config['init_gains'] = np.ones(dU)

    # Set up simple linear dynamics model.
    Fd, fc = guess_dynamics(config['init_gains'], config['init_acc'],
                            dX, dU, dt)

    # Set up a cost function based on stiffness.
    # Ltt = (dX+dU) by (dX+dU) - Hessian of loss with respect to
    # trajectory at a single timestep.
    Ltt = np.diag(np.hstack([
        config['stiffness'] * np.ones(dU),
        config['stiffness'] * config['stiffness_vel'] * np.ones(dU),
        np.zeros(dX - dU*2), np.ones(dU)
    ]))
    Ltt = Ltt / config['init_var']  # Cost function - quadratic term.
    lt = -Ltt.dot(np.r_[x0, np.zeros(dU)])  # Cost function - linear term.

    # Perform dynamic programming.
    K = np.zeros((T, dU, dX))  # Controller gains matrix.
    k = np.zeros((T, dU))  # Controller bias term.
    PSig = np.zeros((T, dU, dU))  # Covariance of noise.
    cholPSig = np.zeros((T, dU, dU))  # Cholesky decomposition.
    invPSig = np.zeros((T, dU, dU))  # Inverse of covariance.
    vx_t = np.zeros(dX)  # Vx = dV/dX. Derivative of value function.
    Vxx_t = np.zeros((dX, dX))  # Vxx = ddV/dXdX.

    #TODO: A lot of this code is repeated with traj_opt_lqr_python.py
    #      backward pass.
    for t in range(T - 1, -1, -1):
        # Compute Q function at this step.
        if t == (T - 1):
            Ltt_t = config['final_weight'] * Ltt
            lt_t = config['final_weight'] * lt
        else:
            Ltt_t = Ltt
            lt_t = lt
        # Qtt = (dX+dU) by (dX+dU) 2nd Derivative of Q-function with
        # respect to trajectory (dX+dU).
        Qtt_t = Ltt_t + Fd.T.dot(Vxx_t).dot(Fd)
        # Qt = (dX+dU) 1st Derivative of Q-function with respect to
        # trajectory (dX+dU).
        qt_t = lt_t + Fd.T.dot(vx_t + Vxx_t.dot(fc))

        # Compute preceding value function.
        U = sp.linalg.cholesky(Qtt_t[idx_u, idx_u])
        L = U.T

        invPSig[t, :, :] = Qtt_t[idx_u, idx_u]
        PSig[t, :, :] = sp.linalg.solve_triangular(
            U, sp.linalg.solve_triangular(L, np.eye(dU), lower=True)
        )
        cholPSig[t, :, :] = sp.linalg.cholesky(PSig[t, :, :])
        K[t, :, :] = -sp.linalg.solve_triangular(
            U, sp.linalg.solve_triangular(L, Qtt_t[idx_u, idx_x], lower=True)
        )
        k[t, :] = -sp.linalg.solve_triangular(
            U, sp.linalg.solve_triangular(L, qt_t[idx_u], lower=True)
        )
        Vxx_t = Qtt_t[idx_x, idx_x] + Qtt_t[idx_x, idx_u].dot(K[t, :, :])
        vx_t = qt_t[idx_x] + Qtt_t[idx_x, idx_u].dot(k[t, :])
        Vxx_t = 0.5 * (Vxx_t + Vxx_t.T)

    return LinearGaussianPolicy(K, k, PSig, cholPSig, invPSig)
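The backward pass never inverts Qtt_t[idx_u, idx_u] directly; PSig, K, and k all come from two triangular solves against its Cholesky factor. A standalone check (synthetic Quu, not from the recursion above) that this pattern reproduces the inverse:

import numpy as np
import scipy.linalg as sla

Quu = np.array([[3.0, 0.5],
                [0.5, 2.0]])
U = sla.cholesky(Quu)          # upper-triangular factor, Quu = U.T.dot(U)
L = U.T
inv_Quu = sla.solve_triangular(
    U, sla.solve_triangular(L, np.eye(2), lower=True))
print(np.allclose(inv_Quu, np.linalg.inv(Quu)))   # True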
Example #10
def init_lto_controller(hyperparams, agent):
    """
    Finds an initial linear-Gaussian controller for the learning-to-optimize
    experiments by grid-searching over candidate learning rates and momentum
    values on the current condition's objective function.
    """
    config = copy.deepcopy(INIT_LG)
    config.update(hyperparams)

    dX, dU = config['dX'], config['dU']
    T = config['T']
    cur_cond_idx = config['cur_cond_idx']
    history_len = agent.history_len
    fcn = agent.fcns[cur_cond_idx]
    # Create a new world to avoid mutating the state of the original world.
    world = LTOWorld(fcn['fcn_obj'], fcn['dim'], fcn['init_loc'], history_len)
    
    # Compute initial state. 
    world.reset_world()
    world.run()
    x0 = agent.get_vectorized_state(world.get_state(), cur_cond_idx)
    
    best_momentum = None
    best_learning_rate = None
    min_obj_val = float('Inf')
    
    if config['verbose']:
        print("Finding Initial Linear Gaussian Controller")
    
    for i in range(config['all_possible_momentum_params'].shape[0]):
        cur_momentum = config['all_possible_momentum_params'][i]
        for j in range(config['all_possible_learning_rates'].shape[0]):
            cur_learning_rate = config['all_possible_learning_rates'][j]
            
            cur_Kt = np.zeros((dU, dX)) # K matrix for a single time step. 
            # Equivalent to Kt[:,sensor_start_idx:sensor_end_idx] = np.eye(dU)
            # TODO: Why are CUR_GRAD and PAST_GRADS part of the state? The
            # corresponding pack_data_x calls below are left commented out.
            ##agent.pack_data_x(cur_Kt, np.eye(dU), data_types=[CUR_GRAD])
            # Oldest gradients come first
            ##agent.pack_data_x(cur_Kt, np.tile(np.eye(dU),(1,history_len)) * (cur_momentum ** np.ravel(np.tile(np.arange(history_len,0,-1)[:,None],(1,dU))))[None,:], data_types=[PAST_GRADS])
            cur_Kt = -cur_learning_rate*cur_Kt
            
            cur_kt = np.dot(cur_Kt, x0)
            
            cur_policy = LinearGaussianPolicy(cur_Kt[None,:,:], cur_kt[None,:], np.zeros((1,dU,dU)), np.zeros((1,dU,dU)), np.zeros((1,dU,dU)))
            
            world.reset_world()
            world.run()
            for t in range(T):
                X_t = agent.get_vectorized_state(world.get_state(), cur_cond_idx)
                U_t = cur_policy.act(X_t, None, 0, np.zeros((dU,)))
                world.run_next(U_t)
            fcn['fcn_obj'].new_sample(batch_size="all")
            cur_obj_val = fcn['fcn_obj'].evaluate(world.cur_loc)
            if config['verbose']:
                print("Learning Rate: %.4f, Momentum: %.4f, Final Objective Value: %.4f" % (cur_learning_rate,cur_momentum,cur_obj_val))
            if cur_obj_val < min_obj_val:
                min_obj_val = cur_obj_val
                best_momentum = cur_momentum
                best_learning_rate = cur_learning_rate
    
    if config['verbose']:
        print("")
        print("Best Final Objective Value: %.4f" % (min_obj_val))
        print("Best Momentum: %.4f" % (best_momentum))
        print("Best Learning Rate: %.4f" % (best_learning_rate))
        print("------------------------------------------------------")
            
    Kt = np.zeros((dU, dX)) # K matrix for a single time step. 
    # Equivalent to Kt[:,sensor_start_idx:sensor_end_idx] = np.eye(dU)
    ##agent.pack_data_x(Kt, np.eye(dU), data_types=[CUR_GRAD])
    # Oldest gradients come first
    ##agent.pack_data_x(Kt, np.tile(np.eye(dU),(1,history_len)) * (best_momentum ** np.ravel(np.tile(np.arange(history_len,0,-1)[:,None],(1,dU))))[None,:], data_types=[PAST_GRADS])
    Kt = -best_learning_rate*Kt
    
    kt = np.dot(Kt, x0)
    
    K = np.tile(Kt[None,:,:], (T, 1, 1))     # Controller gains matrix.
    k = np.tile(kt[None,:], (T, 1))
    PSig = np.tile((config['init_var']*np.eye(dU))[None,:,:], (T, 1, 1))
    cholPSig = np.tile((np.sqrt(config['init_var'])*np.eye(dU))[None,:,:], (T, 1, 1))
    invPSig = np.tile(((1./config['init_var'])*np.eye(dU))[None,:,:], (T, 1, 1))
    
    return LinearGaussianPolicy(K, k, PSig, cholPSig, invPSig)
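A hedged sketch (key names follow the loops above; the values are purely illustrative) of the grid this initializer searches before freezing the best learning rate and momentum into a time-invariant controller:

import numpy as np

hyperparams = {
    'cur_cond_idx': 0,
    'all_possible_learning_rates': np.array([1e-3, 1e-2, 1e-1]),
    'all_possible_momentum_params': np.array([0.0, 0.5, 0.9]),
    'verbose': True,
}
# pol = init_lto_controller(hyperparams, agent)   # 'agent' supplies the LTOWorld and conditions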