def car(learning_method, number_of_rollouts, simulation_steps,
        learning_episodes, actor_structure, critic_structure, train_dir,
        nn_test=False, retrain_shield=False, shield_test=False,
        test_episodes=100, retrain_nn=False, safe_training=False, shields=1,
        episode_len=100, penalty_ratio=0.1):
  # Set up the actual model (mountain car).
  def f(x, u):
    return np.matrix([[x[1, 0]],
                      [0.001 * u[0, 0] - 0.0025 * np.cos(3 * x[0, 0])]])

  def f_to_str(K):
    kstr = K_to_str(K)
    f = []
    f.append("x[2]")
    f.append("0.001*{} - 0.0025 * cos(3 * x[1])".format(kstr[0]))
    return f

  def rewardf(x, Q, u, R):
    return x[0, 0] - 0.6

  def testf(x, u):
    return x[0, 0] < -np.pi / 3

  def terminalf(x):
    return x[0, 0] >= 0.6

  x_min = np.array([[-1.2], [-0.007]])
  x_max = np.array([[0.7], [0.007]])
  s_min = np.array([[-0.5], [0.0]])
  s_max = np.array([[-0.5], [0.0]])
  u_min = np.array([[-1.0]])
  u_max = np.array([[1.0]])

  # Set up a linearized model. We use splits 0.2 on either side of each
  # peak (since the dynamics use cos(3x), the peaks are at multiples of
  # pi / 3).
  breaks = [-np.pi / 3 - 0.2, -np.pi / 3 + 0.2, -0.2, 0.2, np.pi / 3 - 0.2]
  break_breaks = [5, 5, 5]
  mins = [
      -np.cos(-np.pi - 0.6), -np.cos(-np.pi + 0.6), -1, -1,
      np.cos(np.pi - 0.6)
  ]
  maxes = [1, 1, -np.cos(-0.6), -np.cos(0.6), 1]
  lower_As = []
  upper_As = []
  B = np.array([[0.0], [0.001]])
  for i in range(len(breaks) - 1):
    max_m = (maxes[i + 1] - maxes[i]) / (breaks[i + 1] - breaks[i])
    min_m = (mins[i + 1] - mins[i]) / (breaks[i + 1] - breaks[i])
    # lA * x + B * u <= x' <= uA * x + B * u
    lower_As.append(np.matrix([[0.0, 1.0], [0.0025 * min_m, 0.0]]))
    upper_As.append(np.matrix([[0.0, 1.0], [0.0025 * max_m, 0.0]]))
  Bs = [B] * len(lower_As)

  # We consider unsafe behavior to be moving over the left side of the hill.
  uA = np.matrix([[1.0, 0.0]])
  ub = np.matrix([[-np.pi / 3]])

  env = PolySysEnvironment(f, f_to_str, rewardf, testf, None, 2, 1, None,
                           None, s_min, s_max, x_min=x_min, x_max=x_max,
                           u_min=u_min, u_max=u_max, timestep=1.0,
                           terminalf=terminalf, unsafe_A=uA, unsafe_b=ub,
                           approx=True, breaks=breaks,
                           break_breaks=break_breaks, lower_As=lower_As,
                           lower_Bs=Bs, upper_As=upper_As, upper_Bs=Bs)

  if retrain_nn:
    args = {
        'actor_lr': 0.0001,
        'critic_lr': 0.001,
        'actor_structure': actor_structure,
        'critic_structure': critic_structure,
        'buffer_size': 1000000,
        'gamma': 0.99,
        'max_episode_len': episode_len,
        'max_episodes': learning_episodes,
        'minibatch_size': 64,
        'random_seed': 6554,  # 6553
        'tau': 0.005,
        'model_path': train_dir + "retrained_model.chkp",
        'enable_test': nn_test,
        'test_episodes': test_episodes,
        'test_episodes_len': 500
    }
  else:
    args = {
        'actor_lr': 0.0001,
        'critic_lr': 0.001,
        'actor_structure': actor_structure,
        'critic_structure': critic_structure,
        'buffer_size': 1000000,
        'gamma': 0.99,
        'max_episode_len': episode_len,
        'max_episodes': 0,
        'minibatch_size': 64,
        'random_seed': 6553,
        'tau': 0.005,
        'model_path': train_dir + "model.chkp",
        'enable_test': nn_test,
        'test_episodes': test_episodes,
        'test_episodes_len': 500
    }

  Ks = [np.matrix([[0.0, 0.0]])]
  invs = [(np.matrix([[1, 0], [-1, 0], [0, 1], [0, -1]]),
           np.matrix([[0.8], [0.8], [0.07], [0.07]]))]
  covers = [(invs[0][0], invs[0][1], np.matrix([[-0.8], [-0.07]]),
             np.matrix([[0.8], [0.07]]))]
  bound = 30

  initial_shield = Shield(env, K_list=Ks, inv_list=invs, cover_list=covers,
                          bound=bound)

  actor, shield = DDPG(env, args, safe_training=safe_training,
                       shields=shields, initial_shield=initial_shield,
                       penalty_ratio=penalty_ratio)

  if shield_test:
    shield.test_shield(actor, test_episodes, 5000)

  actor.sess.close()
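# PolySysEnvironment consumes the continuous dynamics f above together with
# a timestep; a minimal standalone sketch of rolling such a model out with
# forward-Euler integration. That Euler stepping is how the environment
# integrates f is an assumption here, not confirmed by this file, and
# _demo_car_rollout is a hypothetical helper added for illustration only.
def _demo_car_rollout(steps=10, h=1.0):
  import numpy as np
  x = np.matrix([[-0.5], [0.0]])  # the fixed initial state (s_min == s_max)
  u = np.matrix([[1.0]])  # constant full throttle, illustrative only
  for _ in range(steps):
    dx = np.matrix([[x[1, 0]],
                    [0.001 * u[0, 0] - 0.0025 * np.cos(3 * x[0, 0])]])
    x = x + h * dx  # x_{k+1} = x_k + h * f(x_k, u_k), h matches timestep=1.0
  return x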
def lanekeep(learning_method, number_of_rollouts, simulation_steps,
             learning_episodes, actor_structure, critic_structure, train_dir,
             nn_test=False, retrain_shield=False, shield_test=False,
             test_episodes=100, retrain_nn=False):
  v0 = 27.7
  cf = 133000
  cr = 98800
  M = 1650
  b = 1.59
  a = 1.11
  Iz = 2315.3
  ds = 4
  us = 2

  disturbance_x_min = np.array([[0], [0], [-0.035], [0]])
  disturbance_x_max = np.array([[0], [0], [0.035], [0]])

  # Dynamics defined as a continuous function.
  def f(x, u):
    delta = np.zeros((ds, 1), float)
    # lateral displacement
    delta[0, 0] = 1 * x[1, 0] + v0 * x[2, 0] + random.uniform(
        disturbance_x_min[0], disturbance_x_max[0])
    # lateral velocity
    delta[1, 0] = (-1 * (cf + cr) / (M * v0)) * x[1, 0] + \
        ((b * cr - a * cf) / (M * v0) - v0) * x[3, 0] + \
        (cf / M) * u[0, 0] + \
        random.uniform(disturbance_x_min[1], disturbance_x_max[1])
    # error yaw angle
    delta[2, 0] = x[3, 0] + random.uniform(disturbance_x_min[2],
                                           disturbance_x_max[2])
    # yaw rate
    delta[3, 0] = ((b * cr - a * cf) / (Iz * v0)) * x[1, 0] + \
        (-1 * (a * a * cf + b * b * cr) / (Iz * v0)) * x[3, 0] + \
        (a * cf / Iz) * u[1, 0] + \
        random.uniform(disturbance_x_min[3], disturbance_x_max[3])
    return delta

  # Closed-loop system dynamics as text.
  def f_to_str(K):
    kstr = K_to_str(K)
    f = []
    f.append("1*x[2] + 27.7*x[3] + d[1]")
    f.append("(-1*(133000+98800)/(1650*27.7))*x[2] + "
             "((1.59*98800-1.11*133000)/(1650*27.7)-27.7)*x[4] + "
             "(133000/1650)*{} + d[2]".format(kstr[0]))
    f.append("x[4] + d[3]")
    f.append("((1.59*98800-1.11*133000)/(2315.3*27.7))*x[2] + "
             "(-1*(1.11*1.11*133000 + 1.59*1.59*98800)/(2315.3*27.7))*x[4] + "
             "(1.11*133000/2315.3)*{} + d[4]".format(kstr[1]))
    return f

  h = 0.01

  # Amount of Gaussian noise in dynamics.
  eq_err = 1e-2

  # Initial state space.
  s_min = np.array([[-0.1], [-0.1], [-0.1], [-0.1]])
  s_max = np.array([[0.1], [0.1], [0.1], [0.1]])

  Q = np.matrix("1 0 0 0; 0 1 0 0; 0 0 1 0; 0 0 0 1")
  R = np.matrix(".0005 0; 0 .0005")

  # User-defined unsafe condition: the car deviates too far from the lane
  # center.
  def unsafe_eval(x):
    if x[0, 0] > 0.9 or x[0, 0] < -0.9:
      return True
    return False

  def unsafe_string():
    return ["-(x[1]- -0.9)*(0.9-x[1])"]

  def rewardf(x, Q, u, R):
    reward = 0
    reward += -np.dot(x.T, Q.dot(x)) - np.dot(u.T, R.dot(u))
    if unsafe_eval(x):
      reward -= 1e-3
    return reward

  def testf(x, u):
    if unsafe_eval(x):
      return -1
    return 0

  # Use the shield to directly learn a linear controller.
  u_min = np.array([[-1]])
  u_max = np.array([[1]])
  env = PolySysEnvironment(f, f_to_str, rewardf, testf, unsafe_string, ds,
                           us, Q, R, s_min, s_max, u_max=u_max, u_min=u_min,
                           disturbance_x_min=disturbance_x_min,
                           disturbance_x_max=disturbance_x_max, timestep=h)

  if retrain_nn:
    args = {
        'actor_lr': 0.0001,
        'critic_lr': 0.001,
        'actor_structure': actor_structure,
        'critic_structure': critic_structure,
        'buffer_size': 1000000,
        'gamma': 0.99,
        'max_episode_len': 1000,
        'max_episodes': 1000,
        'minibatch_size': 64,
        'random_seed': 2903,
        'tau': 0.005,
        'model_path': train_dir + "retrained_model.chkp",
        'enable_test': nn_test,
        'test_episodes': test_episodes,
        'test_episodes_len': 1000
    }
  else:
    args = {
        'actor_lr': 0.0001,
        'critic_lr': 0.001,
        'actor_structure': actor_structure,
        'critic_structure': critic_structure,
        'buffer_size': 1000000,
        'gamma': 0.99,
        'max_episode_len': 1000,
        'max_episodes': learning_episodes,
        'minibatch_size': 64,
        'random_seed': 2903,
        'tau': 0.005,
        'model_path': train_dir + "model.chkp",
        'enable_test': nn_test,
        'test_episodes': test_episodes,
        'test_episodes_len': 1000
    }

  actor = DDPG(env, args)

  model_path = os.path.split(args['model_path'])[0] + '/'
  linear_func_model_name = 'K.model'
  model_path = model_path + linear_func_model_name + '.npy'

  shield = Shield(env, actor, model_path=model_path,
                  force_learning=retrain_shield)
  shield.train_polysys_shield(learning_method, number_of_rollouts,
                              simulation_steps, eq_err=eq_err,
                              explore_mag=0.4, step_size=0.5,
                              without_nn_guide=True, aggressive=True)
  if shield_test:
    shield.test_shield(test_episodes, 1000, mode="single")
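# The polynomial returned by unsafe_string above encodes the same region as
# unsafe_eval: -(x1 + 0.9) * (0.9 - x1) is positive exactly when |x1| > 0.9.
# A standalone numeric cross-check away from the boundary
# (_check_lanekeep_unsafe_encoding is a hypothetical helper added for
# illustration, not part of the original benchmark):
def _check_lanekeep_unsafe_encoding():
  for x1 in [-1.0, -0.95, 0.0, 0.95, 1.0]:
    poly = -(x1 - -0.9) * (0.9 - x1)  # the barrier polynomial from above
    assert (poly > 0) == (abs(x1) > 0.9)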
def biology(learning_method, number_of_rollouts, simulation_steps,
            learning_episodes, critic_structure, actor_structure, train_dir,
            nn_test=False, retrain_shield=False, shield_test=False,
            test_episodes=100, retrain_nn=False):
  # 3-dimensional, 2-input system.
  ds = 3
  us = 2

  # Dynamics defined as a continuous function.
  def f(x, u):
    # random disturbance
    # d = random.uniform(0, 20)
    delta = np.zeros((ds, 1), float)
    delta[0, 0] = -0.01 * x[0, 0] - x[1, 0] * (x[0, 0] + 4.5) + u[0, 0]
    delta[1, 0] = -0.025 * x[1, 0] + 0.000013 * x[2, 0]
    delta[2, 0] = -0.093 * (x[2, 0] + 15) + (1.0 / 12) * u[1, 0]
    return delta

  # Closed-loop system dynamics as text.
  def f_to_str(K):
    kstr = K_to_str(K)
    f = []
    f.append("-0.01*x[1] - x[2]*(x[1]+4.5) + {}".format(kstr[0]))
    f.append("-0.025*x[2] + 0.000013*x[3]")
    f.append("-0.093*(x[3] + 15) + (1/12)*{}".format(kstr[1]))
    return f

  h = 0.01

  # Amount of Gaussian noise in dynamics.
  eq_err = 1e-2

  # Initial state space.
  s_min = np.array([[-2], [-0], [-0.1]])
  s_max = np.array([[2], [0], [0.1]])

  Q = np.zeros((ds, ds), float)
  R = np.zeros((us, us), float)
  np.fill_diagonal(Q, 1)
  np.fill_diagonal(R, 1)

  # User-defined unsafe condition.
  def unsafe_eval(x):
    if x[0, 0] >= 5:
      return True
    return False

  def unsafe_string():
    return ["x[1] - 5"]

  def rewardf(x, Q, u, R):
    reward = 0
    reward += -np.dot(x.T, Q.dot(x)) - np.dot(u.T, R.dot(u))
    if unsafe_eval(x):
      reward -= 100
    return reward

  def testf(x, u):
    if unsafe_eval(x):
      print(x)
      return -1
    return 0

  u_min = np.array([[-50.], [-50]])
  u_max = np.array([[50.], [50]])
  env = PolySysEnvironment(f, f_to_str, rewardf, testf, unsafe_string, ds,
                           us, Q, R, s_min, s_max, u_max=u_max, u_min=u_min,
                           timestep=h)

  ############ Train and Test NN model ############
  if retrain_nn:
    args = {
        'actor_lr': 0.0001,
        'critic_lr': 0.001,
        'actor_structure': actor_structure,
        'critic_structure': critic_structure,
        'buffer_size': 1000000,
        'gamma': 0.99,
        'max_episode_len': 100,
        'max_episodes': 1000,
        'minibatch_size': 64,
        'random_seed': 6553,
        'tau': 0.005,
        'model_path': train_dir + "retrained_model.chkp",
        'enable_test': nn_test,
        'test_episodes': test_episodes,
        'test_episodes_len': 1000
    }
  else:
    args = {
        'actor_lr': 0.0001,
        'critic_lr': 0.001,
        'actor_structure': actor_structure,
        'critic_structure': critic_structure,
        'buffer_size': 1000000,
        'gamma': 0.99,
        'max_episode_len': 100,
        'max_episodes': learning_episodes,
        'minibatch_size': 64,
        'random_seed': 6553,
        'tau': 0.005,
        'model_path': train_dir + "model.chkp",
        'enable_test': nn_test,
        'test_episodes': test_episodes,
        'test_episodes_len': 1000
    }
  actor = DDPG(env, args=args)

  #################### Shield #################
  model_path = os.path.split(args['model_path'])[0] + '/'
  linear_func_model_name = 'K.model'
  model_path = model_path + linear_func_model_name + '.npy'

  shield = Shield(env, actor, model_path=model_path,
                  force_learning=retrain_shield)
  shield.train_polysys_shield(learning_method, number_of_rollouts,
                              simulation_steps, eq_err=eq_err,
                              explore_mag=0.4, step_size=0.5,
                              aggressive=True, without_nn_guide=True,
                              enable_jit=True)
  if shield_test:
    shield.test_shield(test_episodes, 1000, mode="single")

  actor.sess.close()
def road(learning_method, number_of_rollouts, simulation_steps,
         learning_episodes, actor_structure, critic_structure, train_dir,
         nn_test=False, retrain_shield=False, shield_test=False,
         test_episodes=100, retrain_nn=False, safe_training=False, shields=1,
         episode_len=100, penalty_ratio=0.1):
  # States: position, velocity, constant
  # Actions: acceleration
  A = np.matrix([[0, 10, 0], [0, 0, 0], [0, 0, 0]])
  B = np.matrix([[0], [10], [0]])

  def f(x, u):
    dA = -0.01 + 0.02 * np.random.rand(3, 3)
    dB = -0.01 + 0.02 * np.random.rand(3, 1)
    return (A + dA) * x + (B + dB) * u

  def f_to_str(K):
    raise NotImplementedError

  def testf(x, u):
    return x[1, 0] >= 10 or x[1, 0] <= -10

  s_min = np.array([[0], [0], [1]])
  s_max = np.array([[0], [0], [1]])

  x_goal = 3.0
  max_speed = 10.0
  x_min = np.array([[-100.0], [-10.0], [0.0]])
  x_max = np.array([[100.0], [max_speed], [2.0]])

  def rewardf(x, Q, u, R):
    return x[0, 0] - x_goal

  def terminalf(x):
    return x[0, 0] >= x_goal

  u_min = np.array([[-2.0]])
  u_max = np.array([[5.0]])

  breaks = []
  break_breaks = []
  lower_As = [A - np.full(A.shape, 0.01)]
  upper_As = [A + np.full(A.shape, 0.01)]
  lower_Bs = [B - np.full(B.shape, 0.01)]
  upper_Bs = [B + np.full(B.shape, 0.01)]

  uA = np.matrix([[0.0, -1.0, 0.0]])
  ub = np.matrix([[-10.0]])

  env = PolySysEnvironment(f, f_to_str, rewardf, testf, None, 3, 1, None,
                           None, s_min, s_max, x_min=x_min, x_max=x_max,
                           u_min=u_min, u_max=u_max, timestep=0.001,
                           unsafe_A=uA, unsafe_b=ub, approx=True,
                           breaks=breaks, break_breaks=break_breaks,
                           lower_As=lower_As, lower_Bs=lower_Bs,
                           upper_As=upper_As, upper_Bs=upper_Bs,
                           terminalf=terminalf)

  if retrain_nn:
    args = {
        'actor_lr': 0.0001,
        'critic_lr': 0.001,
        'actor_structure': actor_structure,
        'critic_structure': critic_structure,
        'buffer_size': 1000000,
        'gamma': 0.99,
        'max_episode_len': episode_len,
        'max_episodes': learning_episodes,  # originally 1000
        'minibatch_size': 64,
        'random_seed': 6553,
        'tau': 0.005,
        'model_path': train_dir + "retrained_model.chkp",
        'enable_test': nn_test,
        'test_episodes': test_episodes,
        'test_episodes_len': 5000
    }
  else:
    args = {
        'actor_lr': 0.0001,
        'critic_lr': 0.001,
        'actor_structure': actor_structure,
        'critic_structure': critic_structure,
        'buffer_size': 1000000,
        'gamma': 0.99,
        'max_episode_len': episode_len,
        'max_episodes': learning_episodes,
        'minibatch_size': 64,
        'random_seed': 6553,
        'tau': 0.005,
        'model_path': train_dir + "model.chkp",
        'enable_test': nn_test,
        'test_episodes': test_episodes,
        'test_episodes_len': 5000
    }

  Ks = [np.matrix([[0.0, 0.0, 0.0]])]
  invs = [(np.matrix([[0.0, 1.0, 0.0]]), np.matrix([[max_speed - 1.0]]))]
  covers = [(invs[0][0], invs[0][1], np.matrix([[-1.0, -1.0, 1.0]]),
             np.matrix([[x_goal, max_speed - max_speed / 2.0, 1.0]]))]
  bound = 30

  initial_shield = Shield(env, K_list=Ks, inv_list=invs, cover_list=covers,
                          bound=bound)

  actor, shield = DDPG(env, args, safe_training=safe_training,
                       shields=shields, initial_shield=initial_shield,
                       penalty_ratio=penalty_ratio, bound=bound)

  if shield_test:
    shield.test_shield(actor, test_episodes, 5000)

  actor.sess.close()
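# The interval model [A - 0.01, A + 0.01] (and likewise for B) above is
# meant to enclose every disturbed matrix A + dA sampled in f, where dA is
# uniform in [-0.01, 0.01). A standalone sketch of that containment check
# (_check_road_interval_model is a hypothetical helper for illustration):
def _check_road_interval_model(trials=1000):
  import numpy as np
  A = np.matrix([[0, 10, 0], [0, 0, 0], [0, 0, 0]])
  lo, hi = A - 0.01, A + 0.01
  for _ in range(trials):
    dA = -0.01 + 0.02 * np.random.rand(3, 3)  # same sampling as f above
    assert (lo <= A + dA).all() and (A + dA <= hi).all()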
def acc(learning_method, number_of_rollouts, simulation_steps,
        learning_episodes, actor_structure, critic_structure, train_dir,
        nn_test=False, retrain_shield=False, shield_test=False,
        test_episodes=100, retrain_nn=False, safe_training=False, shields=1,
        episode_len=100, penalty_ratio=0.1):
  # Adaptive cruise control: we use a relative reference frame in which the
  # ego car gets a reward for being close to the lead car.
  # x0 is the negative distance between the cars
  # x1 is their relative speed (positive if the ego car is catching up)
  # u0 is the acceleration of the ego car
  a_min = -3
  a_max = 2

  def f(x, u):
    # lead_a = random.random() * (a_max - a_min) + a_min
    lead_a = max(a_min, min(a_max, random.gauss(0, 1)))
    return np.matrix([[x[1, 0]], [u[0, 0] - lead_a], [0.0]])

  def f_to_str(K):
    raise NotImplementedError

  def rewardf(x, Q, u, R):
    return x[0, 0]

  def testf(x, u):
    return x[0, 0] >= 0

  def terminalf(x):
    # We don't terminate episodes early unless we hit a bad state.
    return False

  x_min = np.array([[-5.0], [-5.0], [1.0]])
  x_max = np.array([[0.5], [5.0], [1.0]])
  s_min = np.array([[-1.1], [-0.1], [1.0]])
  s_max = np.array([[-0.9], [0.1], [1.0]])
  u_min = np.array([[a_min]])
  u_max = np.array([[a_max]])

  B = np.matrix([[0.0], [1.0], [0.0]])
  lower_A = np.matrix([[0.0, 1.0, 0.0], [0.0, 0.0, -a_max], [0.0, 0.0, 0.0]])
  upper_A = np.matrix([[0.0, 1.0, 0.0], [0.0, 0.0, -a_min], [0.0, 0.0, 0.0]])
  breaks = []
  break_breaks = []

  uA = np.matrix([[-1.0, 0.0, 0.0]])
  ub = np.matrix([[0.0]])

  env = PolySysEnvironment(f, f_to_str, rewardf, testf, None, 3, 1, None,
                           None, s_min, s_max, x_min=x_min, x_max=x_max,
                           u_min=u_min, u_max=u_max, timestep=0.01,
                           unsafe_A=uA, unsafe_b=ub, approx=True,
                           breaks=breaks, break_breaks=break_breaks,
                           lower_As=[lower_A], lower_Bs=[B],
                           upper_As=[upper_A], upper_Bs=[B],
                           terminalf=terminalf)

  if retrain_nn:
    args = {
        'actor_lr': 0.0001,
        'critic_lr': 0.001,
        'actor_structure': actor_structure,  # e.g. [240, 200]
        'critic_structure': critic_structure,  # e.g. [280, 240, 200]
        'buffer_size': 1000000,
        'gamma': 0.99,
        'max_episode_len': episode_len,  # 100
        'max_episodes': learning_episodes,  # originally 1000
        'minibatch_size': 64,
        'random_seed': 6553,
        'tau': 0.005,
        'model_path': train_dir + "retrained_model.chkp",
        'enable_test': nn_test,
        'test_episodes': test_episodes,
        'test_episodes_len': 5000
    }
  else:
    args = {
        'actor_lr': 0.0001,
        'critic_lr': 0.001,
        'actor_structure': actor_structure,
        'critic_structure': critic_structure,
        'buffer_size': 1000000,
        'gamma': 0.99,
        'max_episode_len': episode_len,
        'max_episodes': learning_episodes,
        'minibatch_size': 64,
        'random_seed': 6553,
        'tau': 0.005,
        'model_path': train_dir + "model.chkp",
        'enable_test': nn_test,
        'test_episodes': test_episodes,
        'test_episodes_len': 5000
    }

  # Initial strategy: always brake hard.
  Ks = [np.matrix([[0.0, 0.0, a_min]])]
  invs = [(np.matrix([[1.0, 0.0, 0.0]]), np.matrix([[-0.1]]))]
  covers = [(invs[0][0], invs[0][1], np.matrix([[-2.0], [-2.0], [1.0]]),
             np.matrix([[-0.5], [2.0], [1.0]]))]
  bound = 30

  initial_shield = Shield(env, K_list=Ks, inv_list=invs, cover_list=covers,
                          bound=bound)

  actor, shield = DDPG(env, args, safe_training=safe_training,
                       shields=shields, initial_shield=initial_shield,
                       penalty_ratio=penalty_ratio, bound=bound)

  if shield_test:
    shield.test_shield(actor, test_episodes, 5000)

  actor.sess.close()
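# The initial shield above encodes "always brake hard": with the constant
# third state pinned to 1, the linear policy u = K * x evaluates to a_min in
# every state. A standalone check (illustrative; that the shield applies K
# linearly to the augmented state is an assumption suggested by the
# K_list/inv_list interface, not confirmed by this file):
def _check_acc_initial_policy():
  import numpy as np
  K = np.matrix([[0.0, 0.0, -3.0]])  # Ks[0] above, with a_min = -3
  for x0, x1 in [(-1.0, 0.0), (-0.5, 2.0), (-4.0, -1.0)]:
    x = np.matrix([[x0], [x1], [1.0]])  # augmented state with constant 1
    assert float(K * x) == -3.0  # constant braking action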
def pendulum(learning_method, number_of_rollouts, simulation_steps,
             learning_episodes, critic_structure, actor_structure, train_dir,
             nn_test=False, retrain_shield=False, shield_test=False,
             test_episodes=100, retrain_nn=False, safe_training=False,
             shields=1, episode_len=100, penalty_ratio=0.1):
  # Two-dimensional state space: angle and angular velocity. We assume that
  # downward is 0 and the goal is to reach pi or -pi.
  # One-dimensional action space: torque to apply.
  # The safety condition here is a bound on the angular velocity of the
  # pendulum.
  l = 4.0
  t = 10.0
  max_speed = 1.5

  def f(x, u):
    # Pendulums are governed by d^2 theta / dt^2 + g / l * sin(theta) = 0,
    # so we have theta' = omega and omega' = t * u - g / l * sin(theta).
    return np.matrix([[x[1, 0]],
                      [t * u[0, 0] - 9.81 / l * np.sin(x[0, 0])], [0.0]])

  def f_to_str(K):
    raise NotImplementedError

  def rewardf(x, Q, u, R):
    return max(-np.abs(x[0, 0] - np.pi), -np.abs(x[0, 0] + np.pi))

  def testf(x, u):
    return np.abs(x[1, 0]) >= max_speed

  x_min = np.array([[-3.3], [-5.0], [1.0]])
  x_max = np.array([[3.3], [5.0], [1.0]])
  s_min = np.array([[0.0], [0.0], [1.0]])
  s_max = np.array([[0.0], [0.0], [1.0]])
  u_min = np.array([[-2.0]])
  u_max = np.array([[2.0]])

  # For sin on -pi to pi, we'll have a ramp and a plateau piece:
  breaks = [
      -3 * np.pi / 2 + 0.6, -np.pi / 2 - 0.6, -np.pi / 2 + 0.6,
      np.pi / 2 - 0.6, np.pi / 2 + 0.6, 3 * np.pi / 2 - 0.6
  ]
  break_breaks = [6, 6, 6]
  mins = [
      -1, -np.sin(-np.pi / 2 - 0.6), -np.sin(-np.pi / 2 + 0.6), -1, -1,
      -np.sin(3 * np.pi / 2 - 0.6)
  ]
  maxes = [
      -np.sin(-3 * np.pi / 2 + 0.6), 1, 1, -np.sin(np.pi / 2 - 0.6),
      -np.sin(np.pi / 2 + 0.6), 1
  ]
  lower_As = []
  upper_As = []
  B = np.array([[0.0], [t], [0.0]])
  for i in range(len(breaks) - 1):
    max_m = (maxes[i + 1] - maxes[i]) / (breaks[i + 1] - breaks[i])
    min_m = (mins[i + 1] - mins[i]) / (breaks[i + 1] - breaks[i])
    # lA * x + B * u <= x' <= uA * x + B * u
    lower_As.append(
        np.matrix([[0.0, 1.0, 0.0], [9.81 / l * min_m, 0.0, 0.0],
                   [0.0, 0.0, 0.0]]))
    upper_As.append(
        np.matrix([[0.0, 1.0, 0.0], [9.81 / l * max_m, 0.0, 0.0],
                   [0.0, 0.0, 0.0]]))
  Bs = [B] * len(lower_As)

  uA = np.matrix([[0.0, -1.0, 0.0]])
  ub = np.matrix([[-max_speed]])

  env = PolySysEnvironment(f, f_to_str, rewardf, testf, None, 3, 1, None,
                           None, s_min, s_max, x_min=x_min, x_max=x_max,
                           u_min=u_min, u_max=u_max, timestep=0.01,
                           unsafe_A=uA, unsafe_b=ub, approx=True,
                           breaks=breaks, break_breaks=break_breaks,
                           lower_As=lower_As, lower_Bs=Bs,
                           upper_As=upper_As, upper_Bs=Bs)

  if retrain_nn:
    args = {
        'actor_lr': 0.0001,
        'critic_lr': 0.001,
        'actor_structure': actor_structure,
        'critic_structure': critic_structure,
        'buffer_size': 1000000,
        'gamma': 0.99,
        'max_episode_len': episode_len,
        'max_episodes': learning_episodes,
        'minibatch_size': 64,
        'random_seed': 6554,  # 6553
        'tau': 0.005,
        'model_path': train_dir + "retrained_model.chkp",
        'enable_test': nn_test,
        'test_episodes': test_episodes,
        'test_episodes_len': 500
    }
  else:
    args = {
        'actor_lr': 0.0001,
        'critic_lr': 0.001,
        'actor_structure': actor_structure,
        'critic_structure': critic_structure,
        'buffer_size': 1000000,
        'gamma': 0.99,
        'max_episode_len': episode_len,
        'max_episodes': 0,
        'minibatch_size': 64,
        'random_seed': 6553,
        'tau': 0.005,
        'model_path': train_dir + "model.chkp",
        'enable_test': nn_test,
        'test_episodes': test_episodes,
        'test_episodes_len': 500
    }

  Ks = [np.matrix([[0.0, 0.0, 0.0]])]
  invs = [(np.matrix([[1, 0, 0], [-1, 0, 0], [0, 1, 0], [0, -1, 0]]),
           np.matrix([[3.3], [3.3], [max_speed], [max_speed]]))]
  covers = [(invs[0][0], invs[0][1],
             np.matrix([[-2.0], [-max_speed / 2], [1.0]]),
             np.matrix([[2.0], [max_speed / 2], [1.0]]))]
  bound = 10

  initial_shield = Shield(env, K_list=Ks, inv_list=invs, cover_list=covers,
                          bound=bound)

  actor, shield = DDPG(env, args, safe_training=safe_training,
                       shields=shields, initial_shield=initial_shield,
                       penalty_ratio=penalty_ratio)

  if shield_test:
    shield.test_shield(actor, test_episodes, 5000)

  actor.sess.close()
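# A standalone sanity check (not part of the benchmark) that the
# piecewise-linear segments above cover the pendulum's whole angular
# operating range [x_min[0], x_max[0]] and are listed in increasing order;
# that the environment requires such coverage is an assumption here.
def _check_pendulum_breaks_cover_range():
  import numpy as np
  breaks = [
      -3 * np.pi / 2 + 0.6, -np.pi / 2 - 0.6, -np.pi / 2 + 0.6,
      np.pi / 2 - 0.6, np.pi / 2 + 0.6, 3 * np.pi / 2 - 0.6
  ]
  assert breaks[0] <= -3.3 and breaks[-1] >= 3.3  # covers [-3.3, 3.3]
  assert all(b1 < b2 for b1, b2 in zip(breaks, breaks[1:]))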
def road(learning_method, number_of_rollouts, simulation_steps,
         learning_episodes, actor_structure, critic_structure, train_dir,
         nn_test=False, retrain_shield=False, shield_test=False,
         test_episodes=100, retrain_nn=False, safe_training=False, shields=1,
         episode_len=100, penalty_ratio=0.1):
  # States: pos_x, pos_y, vel_x, vel_y, constant
  # Actions: acc_x, acc_y
  # Note: this two-dimensional variant shares its name with the
  # one-dimensional road benchmark above; if both are defined in the same
  # module, this definition shadows the earlier one.
  A = np.matrix([[0.0, 0.0, 10.0, 0.0, 0.0], [0.0, 0.0, 0.0, 10.0, 0.0],
                 [0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0],
                 [0.0, 0.0, 0.0, 0.0, 0.0]])
  B = 0.5 * np.matrix([[0.0, 0.0], [0.0, 0.0], [10.0, 0.0], [0.0, 10.0],
                       [0.0, 0.0]])

  # s_min = np.array([[-0.1], [-0.1], [-0.1], [-0.1], [1]])
  # s_max = np.array([[0.1], [0.1], [0.1], [0.1], [1]])
  s_min = np.array([[0.0], [0.0], [0.0], [0.0], [1.0]])
  s_max = np.array([[0.0], [0.0], [0.0], [0.0], [1.0]])

  x_goal = 3.0
  y_goal = 3.0
  max_speed = 10.0
  x_min = np.array([[-100.0], [-100.0], [-max_speed], [-max_speed], [0.0]])
  x_max = np.array([[100.0], [100.0], [max_speed], [max_speed], [2.0]])

  def f(x, u):
    dA = -0.06 + 0.12 * np.random.rand(5, 5)
    for i in range(5):
      dA[4, i] = 0.0
    dB = -0.06 + 0.12 * np.random.rand(5, 2)
    for i in range(2):
      dB[4, i] = 0.0
    return (A + dA) * x + (B + dB) * u

  def f_to_str(K):
    raise NotImplementedError

  def testf(x, u):
    return np.sqrt(x[2, 0]**2 + x[3, 0]**2) > max_speed

  def rewardf(x, Q, u, R):
    return -(abs(x[0, 0] - x_goal) + abs(x[1, 0] - y_goal))

  def terminalf(x):
    return x[0, 0] >= x_goal and x[1, 0] >= y_goal

  lower_As = [A - np.full(A.shape, 0.06)]
  upper_As = [A + np.full(A.shape, 0.06)]
  lower_Bs = [B - np.full(B.shape, 0.06)]
  upper_Bs = [B + np.full(B.shape, 0.06)]

  u_min = np.array([[-2.0], [-2.0]])
  u_max = np.array([[5.0], [5.0]])

  # The safety property here is that sqrt(v_x^2 + v_y^2) < max_speed, but we
  # can't handle this constraint directly, so we need a linear
  # overapproximation of the unsafe set. We can get one by inscribing an
  # n-gon in the circle defined by the actual safety constraint: every
  # velocity outside the polygon (including some that are still inside the
  # circle) is treated as unsafe.
  n = 8
  xs = []
  ys = []
  for i in range(n):
    ang = 2 * i * np.pi / n + np.pi / n
    xs.append(max_speed * np.cos(ang))
    ys.append(max_speed * np.sin(ang))
  unsafe_A = []
  unsafe_b = []
  for i in range(n):
    j = (i + 1) % n
    if abs(xs[j] - xs[i]) < 0.000001:
      if xs[i] >= 0.0:
        unsafe_A.append(np.matrix([[0.0, 0.0, -1.0, 0.0, 0.0]]))
        unsafe_b.append(np.matrix([[-xs[i]]]))
      else:
        unsafe_A.append(np.matrix([[0.0, 0.0, 1.0, 0.0, 0.0]]))
        unsafe_b.append(np.matrix([[xs[i]]]))
    else:
      m = (ys[j] - ys[i]) / (xs[j] - xs[i])
      b = ys[i] - m * xs[i]
      if b >= 0.0:
        unsafe_A.append(np.matrix([[0.0, 0.0, m, -1.0, 0.0]]))
        unsafe_b.append(np.matrix([[-b]]))
      else:
        unsafe_A.append(np.matrix([[0.0, 0.0, -m, 1.0, 0.0]]))
        unsafe_b.append(np.matrix([[b]]))

  breaks = []
  break_breaks = []

  env = PolySysEnvironment(f, f_to_str, rewardf, testf, None, 5, 2, None,
                           None, s_min, s_max, x_min=x_min, x_max=x_max,
                           u_min=u_min, u_max=u_max, unsafe_A=unsafe_A,
                           unsafe_b=unsafe_b, timestep=0.01, approx=True,
                           breaks=breaks, break_breaks=break_breaks,
                           lower_As=lower_As, lower_Bs=lower_Bs,
                           upper_As=upper_As, upper_Bs=upper_Bs,
                           terminalf=terminalf)

  if retrain_nn:
    args = {
        'actor_lr': 0.0001,
        'critic_lr': 0.001,
        'actor_structure': actor_structure,
        'critic_structure': critic_structure,
        'buffer_size': 1000000,
        'gamma': 0.99,
        'max_episode_len': episode_len,
        'max_episodes': learning_episodes,  # originally 1000
        'minibatch_size': 64,
        'random_seed': 6553,
        'tau': 0.005,
        'model_path': train_dir + "retrained_model.chkp",
        'enable_test': nn_test,
        'test_episodes': test_episodes,
        'test_episodes_len': 5000
    }
  else:
    args = {
        'actor_lr': 0.0001,
        'critic_lr': 0.001,
        'actor_structure': actor_structure,
        'critic_structure': critic_structure,
        'buffer_size': 1000000,
        'gamma': 0.99,
        'max_episode_len': episode_len,
        'max_episodes': learning_episodes,
        'minibatch_size': 64,
        'random_seed': 6553,
        'tau': 0.005,
        'model_path': train_dir + "model.chkp",
        'enable_test': nn_test,
        'test_episodes': test_episodes,
        'test_episodes_len': 5000
    }

  Ks = [np.matrix([[0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0]])]
  inv_A = np.matrix(np.zeros((len(unsafe_A), unsafe_A[0].shape[1])))
  inv_b = np.matrix(np.zeros((len(unsafe_b), 1)))
  for i in range(len(unsafe_A)):
    inv_A[i] = -unsafe_A[i][0]
    inv_b[i, 0] = -2.0 * unsafe_b[i][0, 0]
  invs = [(inv_A, inv_b)]
  covers = [(invs[0][0], invs[0][1],
             np.matrix([[-1.0, -1.0, -1.0, -1.0, 1.0]]),
             np.matrix([[x_goal, y_goal, max_speed / 2.0, max_speed / 2.0,
                         1.0]]))]
  bound = 20

  initial_shield = Shield(env, K_list=Ks, inv_list=invs, cover_list=covers,
                          bound=bound)

  actor, shield = DDPG(env, args, safe_training=safe_training,
                       shields=shields, initial_shield=initial_shield,
                       penalty_ratio=penalty_ratio, bound=bound)

  if shield_test:
    shield.test_shield(actor, test_episodes, 5000)

  actor.sess.close()
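# A standalone geometric sanity check of the n-gon construction above
# (illustrative, not part of the benchmark). Using the convention that a
# state is unsafe iff unsafe_A * x <= unsafe_b, every velocity just outside
# the max_speed circle should violate at least one half-plane, while every
# velocity inside the polygon's inscribed circle of radius
# max_speed * cos(pi / n) should satisfy all of them.
def _check_ngon_overapproximation(n=8, max_speed=10.0):
  import numpy as np
  xs = [max_speed * np.cos(2 * i * np.pi / n + np.pi / n) for i in range(n)]
  ys = [max_speed * np.sin(2 * i * np.pi / n + np.pi / n) for i in range(n)]
  halfplanes = []  # (a_vx, a_vy, b): unsafe iff a_vx*vx + a_vy*vy <= b
  for i in range(n):
    j = (i + 1) % n
    if abs(xs[j] - xs[i]) < 1e-6:  # vertical edge, same cases as above
      if xs[i] >= 0.0:
        halfplanes.append((-1.0, 0.0, -xs[i]))
      else:
        halfplanes.append((1.0, 0.0, xs[i]))
    else:
      m = (ys[j] - ys[i]) / (xs[j] - xs[i])
      b = ys[i] - m * xs[i]
      if b >= 0.0:
        halfplanes.append((m, -1.0, -b))
      else:
        halfplanes.append((-m, 1.0, b))
  inradius = max_speed * np.cos(np.pi / n)
  for ang in np.linspace(0.0, 2 * np.pi, 100):
    # Just inside the inscribed circle: safe under every half-plane.
    vx, vy = 0.99 * inradius * np.cos(ang), 0.99 * inradius * np.sin(ang)
    assert all(a * vx + c * vy > b for (a, c, b) in halfplanes)
    # Just outside the speed circle: unsafe under some half-plane.
    vx, vy = 1.01 * max_speed * np.cos(ang), 1.01 * max_speed * np.sin(ang)
    assert any(a * vx + c * vy <= b for (a, c, b) in halfplanes)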
def selfdrive(learning_method, number_of_rollouts, simulation_steps,
              learning_episodes, critic_structure, actor_structure,
              train_dir, nn_test=False, retrain_shield=False,
              shield_test=False, test_episodes=100, retrain_nn=False):
  # 2-dimensional and 1-input system.
  ds = 2
  us = 1

  # The speed is fixed to 2 in this case.
  v = 2
  cl = 2
  cr = -2

  def f(x, u):
    # x[0] is the car's distance from the lane center and x[1] its heading
    # angle; sin(x[1]) is approximated by its third-order Taylor expansion.
    delta = np.zeros((ds, 1), float)
    delta[0, 0] = -v * (x[1, 0] - ((pow(x[1, 0], 3)) / 6))
    delta[1, 0] = u[0, 0]  # angular velocity (controlled by the learner)
    return delta

  # Closed-loop system dynamics as text.
  def f_to_str(K):
    kstr = K_to_str(K)
    f = []
    f.append("-{}*(x[2] - ((x[2]^3)/6))".format(v))
    f.append(kstr[0])
    return f

  h = 0.1

  # Amount of Gaussian noise in dynamics.
  eq_err = 1e-2

  pi = 3.1415926

  # Initial state space.
  s_min = np.array([[-1], [-pi / 4]])
  s_max = np.array([[1], [pi / 4]])
  u_min = np.array([[-10]])
  u_max = np.array([[10]])

  # The only portion of the entire state space that our verification is
  # interested in.
  bound_x_min = np.array([[None], [-pi / 2]])
  bound_x_max = np.array([[None], [pi / 2]])

  # Sample an initial condition for the system.
  x0 = np.matrix([[random.uniform(s_min[0, 0], s_max[0, 0])],
                  [random.uniform(s_min[1, 0], s_max[1, 0])]])
  print("Sampled initial state is:\n {}".format(x0))

  # Reward functions.
  Q = np.zeros((2, 2), float)
  np.fill_diagonal(Q, 1)
  R = np.zeros((1, 1), float)
  np.fill_diagonal(R, 1)

  # User-defined unsafe condition: the car crosses the left or right lane
  # boundary.
  def unsafe_eval(x):
    outbound1 = -(x[0, 0] - cr) * (cl - x[0, 0])
    if outbound1 >= 0:
      return True
    return False

  def unsafe_string():
    return ["-(x[1]- {})*({}-x[1])".format(cr, cl)]

  def rewardf(x, Q, u, R):
    reward = 0
    reward += -np.dot(x.T, Q.dot(x)) - np.dot(u.T, R.dot(u))
    if unsafe_eval(x):
      reward -= 100
    return reward

  def testf(x, u):
    if unsafe_eval(x):
      return -1
    return 0

  def random_test(f, K, simulation_steps, continuous=True, timestep=h):
    total_fails = 0
    for i in range(100):
      x0 = np.matrix([[random.uniform(s_min[0, 0], s_max[0, 0])],
                      [random.uniform(s_min[1, 0], s_max[1, 0])]])
      reward = test_controller_helper(f, K, x0, simulation_steps, testf,
                                      continuous=True, timestep=h)
      if reward < 0:
        print("Failed on {}".format(x0))
        total_fails += 1
    print("Among {} tests, {} failed.".format(100, total_fails))

  names = {0: "p", 1: "gamma"}

  # Use the shield to directly learn a linear controller.
  env = PolySysEnvironment(f, f_to_str, rewardf, testf, unsafe_string, ds,
                           us, Q, R, s_min, s_max, u_max=u_max, u_min=u_min,
                           bound_x_min=bound_x_min, bound_x_max=bound_x_max,
                           timestep=0.1)

  if retrain_nn:
    args = {
        'actor_lr': 0.0001,
        'critic_lr': 0.001,
        'actor_structure': actor_structure,
        'critic_structure': critic_structure,
        'buffer_size': 1000000,
        'gamma': 0.99,
        'max_episode_len': 100,
        'max_episodes': 1000,
        'minibatch_size': 64,
        'random_seed': 6553,
        'tau': 0.005,
        'model_path': train_dir + "retrained_model.chkp",
        'enable_test': nn_test,
        'test_episodes': test_episodes,
        'test_episodes_len': 1000
    }
  else:
    args = {
        'actor_lr': 0.0001,
        'critic_lr': 0.001,
        'actor_structure': actor_structure,
        'critic_structure': critic_structure,
        'buffer_size': 1000000,
        'gamma': 0.99,
        'max_episode_len': 100,
        'max_episodes': learning_episodes,
        'minibatch_size': 64,
        'random_seed': 6553,
        'tau': 0.005,
        'model_path': train_dir + "model.chkp",
        'enable_test': nn_test,
        'test_episodes': test_episodes,
        'test_episodes_len': 1000
    }

  actor = DDPG(env, args=args)

  model_path = os.path.split(args['model_path'])[0] + '/'
  linear_func_model_name = 'K.model'
  model_path = model_path + linear_func_model_name + '.npy'

  def rewardf(x, Q, u, R):
    return np.matrix([[env.reward(x, u)]])

  shield = Shield(env, actor, model_path=model_path,
                  force_learning=retrain_shield)
  shield.train_polysys_shield(learning_method, number_of_rollouts,
                              simulation_steps, explore_mag=0.04,
                              step_size=0.03, without_nn_guide=True)
  if shield_test:
    shield.test_shield(test_episodes, 1000)

  actor.sess.close()
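# A minimal, hypothetical entry point showing how these benchmark functions
# are invoked. All concrete values below (the learning-method name, rollout
# counts, network shapes, and checkpoint directory) are illustrative
# assumptions, not settings taken from the original experiments.
if __name__ == '__main__':
  car(learning_method='random_search',  # assumed name of a search method
      number_of_rollouts=50,
      simulation_steps=100,
      learning_episodes=1000,
      actor_structure=[240, 200],
      critic_structure=[280, 240, 200],
      train_dir='ddpg_chkp/car/',  # hypothetical checkpoint directory
      retrain_nn=True,
      nn_test=True)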