def car(learning_method, number_of_rollouts, simulation_steps,
        learning_episodes, actor_structure, critic_structure, train_dir,
        nn_test=False, retrain_shield=False, shield_test=False,
        test_episodes=100, retrain_nn=False, safe_training=False, shields=1,
        episode_len=100, penalty_ratio=0.1):
  # Set up the actual model (mountain car).
  def f(x, u):
    return np.matrix([[x[1, 0]],
                      [0.001 * u[0, 0] - 0.0025 * np.cos(3 * x[0, 0])]])

  def f_to_str(K):
    kstr = K_to_str(K)
    f = []
    f.append("x[2]")
    f.append("0.001*{} - 0.0025 * cos(3 * x[1])".format(kstr[0]))
    return f

  def rewardf(x, Q, u, R):
    return x[0, 0] - 0.6

  def testf(x, u):
    return x[0, 0] < -np.pi / 3

  def terminalf(x):
    return x[0, 0] >= 0.6

  x_min = np.array([[-1.2], [-0.007]])
  x_max = np.array([[0.7], [0.007]])
  s_min = np.array([[-0.5], [0.0]])
  s_max = np.array([[-0.5], [0.0]])
  u_min = np.array([[-1.0]])
  u_max = np.array([[1.0]])

  # Set up a linearized model. We use splits 0.2 on either side of each
  # peak (since the dynamics use cos(3x), the peaks are at multiples of
  # pi / 3).
  breaks = [-np.pi / 3 - 0.2, -np.pi / 3 + 0.2, -0.2, 0.2, np.pi / 3 - 0.2]
  break_breaks = [5, 5, 5]
  mins = [
      -np.cos(-np.pi - 0.6), -np.cos(-np.pi + 0.6), -1, -1,
      np.cos(np.pi - 0.6)
  ]
  maxes = [1, 1, -np.cos(-0.6), -np.cos(0.6), 1]
  lower_As = []
  upper_As = []
  B = np.array([[0.0], [0.001]])
  for i in range(len(breaks) - 1):
    max_m = (maxes[i + 1] - maxes[i]) / (breaks[i + 1] - breaks[i])
    min_m = (mins[i + 1] - mins[i]) / (breaks[i + 1] - breaks[i])
    # lA * x + B * u <= x' <= uA * x + B * u
    lower_As.append(np.matrix([[0.0, 1.0], [0.0025 * min_m, 0.0]]))
    upper_As.append(np.matrix([[0.0, 1.0], [0.0025 * max_m, 0.0]]))
  Bs = [B] * len(lower_As)

  # We consider unsafe behavior to be moving over the left side of the hill.
  uA = np.matrix([[1.0, 0.0]])
  ub = np.matrix([[-np.pi / 3]])

  env = PolySysEnvironment(f, f_to_str, rewardf, testf, None, 2, 1, None,
                           None, s_min, s_max, x_min=x_min, x_max=x_max,
                           u_min=u_min, u_max=u_max, timestep=1.0,
                           terminalf=terminalf, unsafe_A=uA, unsafe_b=ub,
                           approx=True, breaks=breaks,
                           break_breaks=break_breaks, lower_As=lower_As,
                           lower_Bs=Bs, upper_As=upper_As, upper_Bs=Bs)

  if retrain_nn:
    args = {
        'actor_lr': 0.0001,
        'critic_lr': 0.001,
        'actor_structure': actor_structure,
        'critic_structure': critic_structure,
        'buffer_size': 1000000,
        'gamma': 0.99,
        'max_episode_len': episode_len,
        'max_episodes': learning_episodes,
        'minibatch_size': 64,
        'random_seed': 6554,  # 6553
        'tau': 0.005,
        'model_path': train_dir + "retrained_model.chkp",
        'enable_test': nn_test,
        'test_episodes': test_episodes,
        'test_episodes_len': 500
    }
  else:
    args = {
        'actor_lr': 0.0001,
        'critic_lr': 0.001,
        'actor_structure': actor_structure,
        'critic_structure': critic_structure,
        'buffer_size': 1000000,
        'gamma': 0.99,
        'max_episode_len': episode_len,
        'max_episodes': 0,
        'minibatch_size': 64,
        'random_seed': 6553,
        'tau': 0.005,
        'model_path': train_dir + "model.chkp",
        'enable_test': nn_test,
        'test_episodes': test_episodes,
        'test_episodes_len': 500
    }

  Ks = [np.matrix([[0.0, 0.0]])]
  invs = [(np.matrix([[1, 0], [-1, 0], [0, 1], [0, -1]]),
           np.matrix([[0.8], [0.8], [0.07], [0.07]]))]
  covers = [(invs[0][0], invs[0][1], np.matrix([[-0.8], [-0.07]]),
             np.matrix([[0.8], [0.07]]))]
  bound = 30

  initial_shield = Shield(env, K_list=Ks, inv_list=invs, cover_list=covers,
                          bound=bound)

  actor, shield = DDPG(env, args, safe_training=safe_training,
                       shields=shields, initial_shield=initial_shield,
                       penalty_ratio=penalty_ratio)

  if shield_test:
    shield.test_shield(actor, test_episodes, 5000)

  actor.sess.close()
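# PolySysEnvironment consumes the continuous dynamics f above together with
# a timestep; a minimal standalone sketch of rolling such a model out with
# forward-Euler integration. That Euler stepping is how the environment
# integrates f is an assumption here, not confirmed by this file, and
# _demo_car_rollout is a hypothetical helper added for illustration only.
def _demo_car_rollout(steps=10, h=1.0):
  import numpy as np
  x = np.matrix([[-0.5], [0.0]])  # the fixed initial state (s_min == s_max)
  u = np.matrix([[1.0]])  # constant full throttle, illustrative only
  for _ in range(steps):
    dx = np.matrix([[x[1, 0]],
                    [0.001 * u[0, 0] - 0.0025 * np.cos(3 * x[0, 0])]])
    x = x + h * dx  # x_{k+1} = x_k + h * f(x_k, u_k), h matches timestep=1.0
  return x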
def lanekeep(learning_method, number_of_rollouts, simulation_steps,
             learning_episodes, actor_structure, critic_structure, train_dir,
             nn_test=False, retrain_shield=False, shield_test=False,
             test_episodes=100, retrain_nn=False):
  v0 = 27.7
  cf = 133000
  cr = 98800
  M = 1650
  b = 1.59
  a = 1.11
  Iz = 2315.3
  ds = 4
  us = 2

  disturbance_x_min = np.array([[0], [0], [-0.035], [0]])
  disturbance_x_max = np.array([[0], [0], [0.035], [0]])

  # Dynamics defined as a continuous function.
  def f(x, u):
    delta = np.zeros((ds, 1), float)
    # lateral displacement
    delta[0, 0] = 1 * x[1, 0] + v0 * x[2, 0] + random.uniform(
        disturbance_x_min[0], disturbance_x_max[0])
    # lateral velocity
    delta[1, 0] = (-1 * (cf + cr) / (M * v0)) * x[1, 0] + \
        ((b * cr - a * cf) / (M * v0) - v0) * x[3, 0] + \
        (cf / M) * u[0, 0] + \
        random.uniform(disturbance_x_min[1], disturbance_x_max[1])
    # error yaw angle
    delta[2, 0] = x[3, 0] + random.uniform(disturbance_x_min[2],
                                           disturbance_x_max[2])
    # yaw rate
    delta[3, 0] = ((b * cr - a * cf) / (Iz * v0)) * x[1, 0] + \
        (-1 * (a * a * cf + b * b * cr) / (Iz * v0)) * x[3, 0] + \
        (a * cf / Iz) * u[1, 0] + \
        random.uniform(disturbance_x_min[3], disturbance_x_max[3])
    return delta

  # Closed-loop system dynamics as text.
  def f_to_str(K):
    kstr = K_to_str(K)
    f = []
    f.append("1*x[2] + 27.7*x[3] + d[1]")
    f.append("(-1*(133000+98800)/(1650*27.7))*x[2] + "
             "((1.59*98800-1.11*133000)/(1650*27.7)-27.7)*x[4] + "
             "(133000/1650)*{} + d[2]".format(kstr[0]))
    f.append("x[4] + d[3]")
    f.append("((1.59*98800-1.11*133000)/(2315.3*27.7))*x[2] + "
             "(-1*(1.11*1.11*133000 + 1.59*1.59*98800)/(2315.3*27.7))*x[4] + "
             "(1.11*133000/2315.3)*{} + d[4]".format(kstr[1]))
    return f

  h = 0.01

  # Amount of Gaussian noise in dynamics.
  eq_err = 1e-2

  # Initial state space.
  s_min = np.array([[-0.1], [-0.1], [-0.1], [-0.1]])
  s_max = np.array([[0.1], [0.1], [0.1], [0.1]])

  Q = np.matrix("1 0 0 0; 0 1 0 0; 0 0 1 0; 0 0 0 1")
  R = np.matrix(".0005 0; 0 .0005")

  # User-defined unsafe condition: the car deviates too far from the lane
  # center.
  def unsafe_eval(x):
    if x[0, 0] > 0.9 or x[0, 0] < -0.9:
      return True
    return False

  def unsafe_string():
    return ["-(x[1]- -0.9)*(0.9-x[1])"]

  def rewardf(x, Q, u, R):
    reward = 0
    reward += -np.dot(x.T, Q.dot(x)) - np.dot(u.T, R.dot(u))
    if unsafe_eval(x):
      reward -= 1e-3
    return reward

  def testf(x, u):
    if unsafe_eval(x):
      return -1
    return 0

  # Use the shield to directly learn a linear controller.
  u_min = np.array([[-1]])
  u_max = np.array([[1]])
  env = PolySysEnvironment(f, f_to_str, rewardf, testf, unsafe_string, ds,
                           us, Q, R, s_min, s_max, u_max=u_max, u_min=u_min,
                           disturbance_x_min=disturbance_x_min,
                           disturbance_x_max=disturbance_x_max, timestep=h)

  if retrain_nn:
    args = {
        'actor_lr': 0.0001,
        'critic_lr': 0.001,
        'actor_structure': actor_structure,
        'critic_structure': critic_structure,
        'buffer_size': 1000000,
        'gamma': 0.99,
        'max_episode_len': 1000,
        'max_episodes': 1000,
        'minibatch_size': 64,
        'random_seed': 2903,
        'tau': 0.005,
        'model_path': train_dir + "retrained_model.chkp",
        'enable_test': nn_test,
        'test_episodes': test_episodes,
        'test_episodes_len': 1000
    }
  else:
    args = {
        'actor_lr': 0.0001,
        'critic_lr': 0.001,
        'actor_structure': actor_structure,
        'critic_structure': critic_structure,
        'buffer_size': 1000000,
        'gamma': 0.99,
        'max_episode_len': 1000,
        'max_episodes': learning_episodes,
        'minibatch_size': 64,
        'random_seed': 2903,
        'tau': 0.005,
        'model_path': train_dir + "model.chkp",
        'enable_test': nn_test,
        'test_episodes': test_episodes,
        'test_episodes_len': 1000
    }

  actor = DDPG(env, args)

  model_path = os.path.split(args['model_path'])[0] + '/'
  linear_func_model_name = 'K.model'
  model_path = model_path + linear_func_model_name + '.npy'

  shield = Shield(env, actor, model_path=model_path,
                  force_learning=retrain_shield)
  shield.train_polysys_shield(learning_method, number_of_rollouts,
                              simulation_steps, eq_err=eq_err,
                              explore_mag=0.4, step_size=0.5,
                              without_nn_guide=True, aggressive=True)
  if shield_test:
    shield.test_shield(test_episodes, 1000, mode="single")
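# The polynomial returned by unsafe_string above encodes the same region as
# unsafe_eval: -(x1 + 0.9) * (0.9 - x1) is positive exactly when |x1| > 0.9.
# A standalone numeric cross-check away from the boundary
# (_check_lanekeep_unsafe_encoding is a hypothetical helper added for
# illustration, not part of the original benchmark):
def _check_lanekeep_unsafe_encoding():
  for x1 in [-1.0, -0.95, 0.0, 0.95, 1.0]:
    poly = -(x1 - -0.9) * (0.9 - x1)  # the barrier polynomial from above
    assert (poly > 0) == (abs(x1) > 0.9)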
def biology(learning_method, number_of_rollouts, simulation_steps,
            learning_episodes, critic_structure, actor_structure, train_dir,
            nn_test=False, retrain_shield=False, shield_test=False,
            test_episodes=100, retrain_nn=False):
  # 3-dimensional, 2-input system.
  ds = 3
  us = 2

  # Dynamics defined as a continuous function.
  def f(x, u):
    # random disturbance
    # d = random.uniform(0, 20)
    delta = np.zeros((ds, 1), float)
    delta[0, 0] = -0.01 * x[0, 0] - x[1, 0] * (x[0, 0] + 4.5) + u[0, 0]
    delta[1, 0] = -0.025 * x[1, 0] + 0.000013 * x[2, 0]
    delta[2, 0] = -0.093 * (x[2, 0] + 15) + (1.0 / 12) * u[1, 0]
    return delta

  # Closed-loop system dynamics as text.
  def f_to_str(K):
    kstr = K_to_str(K)
    f = []
    f.append("-0.01*x[1] - x[2]*(x[1]+4.5) + {}".format(kstr[0]))
    f.append("-0.025*x[2] + 0.000013*x[3]")
    f.append("-0.093*(x[3] + 15) + (1/12)*{}".format(kstr[1]))
    return f

  h = 0.01

  # Amount of Gaussian noise in dynamics.
  eq_err = 1e-2

  # Initial state space.
  s_min = np.array([[-2], [-0], [-0.1]])
  s_max = np.array([[2], [0], [0.1]])

  Q = np.zeros((ds, ds), float)
  R = np.zeros((us, us), float)
  np.fill_diagonal(Q, 1)
  np.fill_diagonal(R, 1)

  # User-defined unsafe condition.
  def unsafe_eval(x):
    if x[0, 0] >= 5:
      return True
    return False

  def unsafe_string():
    return ["x[1] - 5"]

  def rewardf(x, Q, u, R):
    reward = 0
    reward += -np.dot(x.T, Q.dot(x)) - np.dot(u.T, R.dot(u))
    if unsafe_eval(x):
      reward -= 100
    return reward

  def testf(x, u):
    if unsafe_eval(x):
      print(x)
      return -1
    return 0

  u_min = np.array([[-50.], [-50]])
  u_max = np.array([[50.], [50]])
  env = PolySysEnvironment(f, f_to_str, rewardf, testf, unsafe_string, ds,
                           us, Q, R, s_min, s_max, u_max=u_max, u_min=u_min,
                           timestep=h)

  ############ Train and Test NN model ############
  if retrain_nn:
    args = {
        'actor_lr': 0.0001,
        'critic_lr': 0.001,
        'actor_structure': actor_structure,
        'critic_structure': critic_structure,
        'buffer_size': 1000000,
        'gamma': 0.99,
        'max_episode_len': 100,
        'max_episodes': 1000,
        'minibatch_size': 64,
        'random_seed': 6553,
        'tau': 0.005,
        'model_path': train_dir + "retrained_model.chkp",
        'enable_test': nn_test,
        'test_episodes': test_episodes,
        'test_episodes_len': 1000
    }
  else:
    args = {
        'actor_lr': 0.0001,
        'critic_lr': 0.001,
        'actor_structure': actor_structure,
        'critic_structure': critic_structure,
        'buffer_size': 1000000,
        'gamma': 0.99,
        'max_episode_len': 100,
        'max_episodes': learning_episodes,
        'minibatch_size': 64,
        'random_seed': 6553,
        'tau': 0.005,
        'model_path': train_dir + "model.chkp",
        'enable_test': nn_test,
        'test_episodes': test_episodes,
        'test_episodes_len': 1000
    }
  actor = DDPG(env, args=args)

  #################### Shield #################
  model_path = os.path.split(args['model_path'])[0] + '/'
  linear_func_model_name = 'K.model'
  model_path = model_path + linear_func_model_name + '.npy'

  shield = Shield(env, actor, model_path=model_path,
                  force_learning=retrain_shield)
  shield.train_polysys_shield(learning_method, number_of_rollouts,
                              simulation_steps, eq_err=eq_err,
                              explore_mag=0.4, step_size=0.5,
                              aggressive=True, without_nn_guide=True,
                              enable_jit=True)
  if shield_test:
    shield.test_shield(test_episodes, 1000, mode="single")

  actor.sess.close()
def road(learning_method, number_of_rollouts, simulation_steps,
         learning_episodes, actor_structure, critic_structure, train_dir,
         nn_test=False, retrain_shield=False, shield_test=False,
         test_episodes=100, retrain_nn=False, safe_training=False, shields=1,
         episode_len=100, penalty_ratio=0.1):
  # States: position, velocity, constant
  # Actions: acceleration
  A = np.matrix([[0, 10, 0], [0, 0, 0], [0, 0, 0]])
  B = np.matrix([[0], [10], [0]])

  def f(x, u):
    dA = -0.01 + 0.02 * np.random.rand(3, 3)
    dB = -0.01 + 0.02 * np.random.rand(3, 1)
    return (A + dA) * x + (B + dB) * u

  def f_to_str(K):
    raise NotImplementedError

  def testf(x, u):
    return x[1, 0] >= 10 or x[1, 0] <= -10

  s_min = np.array([[0], [0], [1]])
  s_max = np.array([[0], [0], [1]])

  x_goal = 3.0
  max_speed = 10.0
  x_min = np.array([[-100.0], [-10.0], [0.0]])
  x_max = np.array([[100.0], [max_speed], [2.0]])

  def rewardf(x, Q, u, R):
    return x[0, 0] - x_goal

  def terminalf(x):
    return x[0, 0] >= x_goal

  u_min = np.array([[-2.0]])
  u_max = np.array([[5.0]])

  breaks = []
  break_breaks = []
  lower_As = [A - np.full(A.shape, 0.01)]
  upper_As = [A + np.full(A.shape, 0.01)]
  lower_Bs = [B - np.full(B.shape, 0.01)]
  upper_Bs = [B + np.full(B.shape, 0.01)]

  uA = np.matrix([[0.0, -1.0, 0.0]])
  ub = np.matrix([[-10.0]])

  env = PolySysEnvironment(f, f_to_str, rewardf, testf, None, 3, 1, None,
                           None, s_min, s_max, x_min=x_min, x_max=x_max,
                           u_min=u_min, u_max=u_max, timestep=0.001,
                           unsafe_A=uA, unsafe_b=ub, approx=True,
                           breaks=breaks, break_breaks=break_breaks,
                           lower_As=lower_As, lower_Bs=lower_Bs,
                           upper_As=upper_As, upper_Bs=upper_Bs,
                           terminalf=terminalf)

  if retrain_nn:
    args = {
        'actor_lr': 0.0001,
        'critic_lr': 0.001,
        'actor_structure': actor_structure,
        'critic_structure': critic_structure,
        'buffer_size': 1000000,
        'gamma': 0.99,
        'max_episode_len': episode_len,
        'max_episodes': learning_episodes,  # originally 1000
        'minibatch_size': 64,
        'random_seed': 6553,
        'tau': 0.005,
        'model_path': train_dir + "retrained_model.chkp",
        'enable_test': nn_test,
        'test_episodes': test_episodes,
        'test_episodes_len': 5000
    }
  else:
    args = {
        'actor_lr': 0.0001,
        'critic_lr': 0.001,
        'actor_structure': actor_structure,
        'critic_structure': critic_structure,
        'buffer_size': 1000000,
        'gamma': 0.99,
        'max_episode_len': episode_len,
        'max_episodes': learning_episodes,
        'minibatch_size': 64,
        'random_seed': 6553,
        'tau': 0.005,
        'model_path': train_dir + "model.chkp",
        'enable_test': nn_test,
        'test_episodes': test_episodes,
        'test_episodes_len': 5000
    }

  Ks = [np.matrix([[0.0, 0.0, 0.0]])]
  invs = [(np.matrix([[0.0, 1.0, 0.0]]), np.matrix([[max_speed - 1.0]]))]
  covers = [(invs[0][0], invs[0][1], np.matrix([[-1.0, -1.0, 1.0]]),
             np.matrix([[x_goal, max_speed - max_speed / 2.0, 1.0]]))]
  bound = 30

  initial_shield = Shield(env, K_list=Ks, inv_list=invs, cover_list=covers,
                          bound=bound)

  actor, shield = DDPG(env, args, safe_training=safe_training,
                       shields=shields, initial_shield=initial_shield,
                       penalty_ratio=penalty_ratio, bound=bound)

  if shield_test:
    shield.test_shield(actor, test_episodes, 5000)

  actor.sess.close()
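# The interval model [A - 0.01, A + 0.01] (and likewise for B) above is
# meant to enclose every disturbed matrix A + dA sampled in f, where dA is
# uniform in [-0.01, 0.01). A standalone sketch of that containment check
# (_check_road_interval_model is a hypothetical helper for illustration):
def _check_road_interval_model(trials=1000):
  import numpy as np
  A = np.matrix([[0, 10, 0], [0, 0, 0], [0, 0, 0]])
  lo, hi = A - 0.01, A + 0.01
  for _ in range(trials):
    dA = -0.01 + 0.02 * np.random.rand(3, 3)  # same sampling as f above
    assert (lo <= A + dA).all() and (A + dA <= hi).all()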
def acc(learning_method, number_of_rollouts, simulation_steps,
        learning_episodes, actor_structure, critic_structure, train_dir,
        nn_test=False, retrain_shield=False, shield_test=False,
        test_episodes=100, retrain_nn=False, safe_training=False, shields=1,
        episode_len=100, penalty_ratio=0.1):
  # Adaptive cruise control: we use a relative reference frame in which the
  # ego car gets a reward for being close to the lead car.
  # x0 is the negative distance between the cars
  # x1 is their relative speed (positive if the ego car is catching up)
  # u0 is the acceleration of the ego car
  a_min = -3
  a_max = 2

  def f(x, u):
    # lead_a = random.random() * (a_max - a_min) + a_min
    lead_a = max(a_min, min(a_max, random.gauss(0, 1)))
    return np.matrix([[x[1, 0]], [u[0, 0] - lead_a], [0.0]])

  def f_to_str(K):
    raise NotImplementedError

  def rewardf(x, Q, u, R):
    return x[0, 0]

  def testf(x, u):
    return x[0, 0] >= 0

  def terminalf(x):
    # We don't terminate episodes early unless we hit a bad state.
    return False

  x_min = np.array([[-5.0], [-5.0], [1.0]])
  x_max = np.array([[0.5], [5.0], [1.0]])
  s_min = np.array([[-1.1], [-0.1], [1.0]])
  s_max = np.array([[-0.9], [0.1], [1.0]])
  u_min = np.array([[a_min]])
  u_max = np.array([[a_max]])

  B = np.matrix([[0.0], [1.0], [0.0]])
  lower_A = np.matrix([[0.0, 1.0, 0.0], [0.0, 0.0, -a_max], [0.0, 0.0, 0.0]])
  upper_A = np.matrix([[0.0, 1.0, 0.0], [0.0, 0.0, -a_min], [0.0, 0.0, 0.0]])
  breaks = []
  break_breaks = []

  uA = np.matrix([[-1.0, 0.0, 0.0]])
  ub = np.matrix([[0.0]])

  env = PolySysEnvironment(f, f_to_str, rewardf, testf, None, 3, 1, None,
                           None, s_min, s_max, x_min=x_min, x_max=x_max,
                           u_min=u_min, u_max=u_max, timestep=0.01,
                           unsafe_A=uA, unsafe_b=ub, approx=True,
                           breaks=breaks, break_breaks=break_breaks,
                           lower_As=[lower_A], lower_Bs=[B],
                           upper_As=[upper_A], upper_Bs=[B],
                           terminalf=terminalf)

  if retrain_nn:
    args = {
        'actor_lr': 0.0001,
        'critic_lr': 0.001,
        'actor_structure': actor_structure,  # e.g. [240, 200]
        'critic_structure': critic_structure,  # e.g. [280, 240, 200]
        'buffer_size': 1000000,
        'gamma': 0.99,
        'max_episode_len': episode_len,  # 100
        'max_episodes': learning_episodes,  # originally 1000
        'minibatch_size': 64,
        'random_seed': 6553,
        'tau': 0.005,
        'model_path': train_dir + "retrained_model.chkp",
        'enable_test': nn_test,
        'test_episodes': test_episodes,
        'test_episodes_len': 5000
    }
  else:
    args = {
        'actor_lr': 0.0001,
        'critic_lr': 0.001,
        'actor_structure': actor_structure,
        'critic_structure': critic_structure,
        'buffer_size': 1000000,
        'gamma': 0.99,
        'max_episode_len': episode_len,
        'max_episodes': learning_episodes,
        'minibatch_size': 64,
        'random_seed': 6553,
        'tau': 0.005,
        'model_path': train_dir + "model.chkp",
        'enable_test': nn_test,
        'test_episodes': test_episodes,
        'test_episodes_len': 5000
    }

  # Initial strategy: always brake hard.
  Ks = [np.matrix([[0.0, 0.0, a_min]])]
  invs = [(np.matrix([[1.0, 0.0, 0.0]]), np.matrix([[-0.1]]))]
  covers = [(invs[0][0], invs[0][1], np.matrix([[-2.0], [-2.0], [1.0]]),
             np.matrix([[-0.5], [2.0], [1.0]]))]
  bound = 30

  initial_shield = Shield(env, K_list=Ks, inv_list=invs, cover_list=covers,
                          bound=bound)

  actor, shield = DDPG(env, args, safe_training=safe_training,
                       shields=shields, initial_shield=initial_shield,
                       penalty_ratio=penalty_ratio, bound=bound)

  if shield_test:
    shield.test_shield(actor, test_episodes, 5000)

  actor.sess.close()
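# The initial shield above encodes "always brake hard": with the constant
# third state pinned to 1, the linear policy u = K * x evaluates to a_min in
# every state. A standalone check (illustrative; that the shield applies K
# linearly to the augmented state is an assumption suggested by the
# K_list/inv_list interface, not confirmed by this file):
def _check_acc_initial_policy():
  import numpy as np
  K = np.matrix([[0.0, 0.0, -3.0]])  # Ks[0] above, with a_min = -3
  for x0, x1 in [(-1.0, 0.0), (-0.5, 2.0), (-4.0, -1.0)]:
    x = np.matrix([[x0], [x1], [1.0]])  # augmented state with constant 1
    assert float(K * x) == -3.0  # constant braking action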
def pendulum(learning_method, number_of_rollouts, simulation_steps,
             learning_episodes, critic_structure, actor_structure, train_dir,
             nn_test=False, retrain_shield=False, shield_test=False,
             test_episodes=100, retrain_nn=False, safe_training=False,
             shields=1, episode_len=100, penalty_ratio=0.1):
  # Two-dimensional state space: angle and angular velocity. We assume that
  # downward is 0 and the goal is to reach pi or -pi.
  # One-dimensional action space: torque to apply.
  # The safety condition here is a bound on the angular velocity of the
  # pendulum.
  l = 4.0
  t = 10.0
  max_speed = 1.5

  def f(x, u):
    # Pendulums are governed by d^2 theta / dt^2 + g / l * sin(theta) = 0,
    # so we have theta' = omega and omega' = t * u - g / l * sin(theta).
    return np.matrix([[x[1, 0]],
                      [t * u[0, 0] - 9.81 / l * np.sin(x[0, 0])], [0.0]])

  def f_to_str(K):
    raise NotImplementedError

  def rewardf(x, Q, u, R):
    return max(-np.abs(x[0, 0] - np.pi), -np.abs(x[0, 0] + np.pi))

  def testf(x, u):
    return np.abs(x[1, 0]) >= max_speed

  x_min = np.array([[-3.3], [-5.0], [1.0]])
  x_max = np.array([[3.3], [5.0], [1.0]])
  s_min = np.array([[0.0], [0.0], [1.0]])
  s_max = np.array([[0.0], [0.0], [1.0]])
  u_min = np.array([[-2.0]])
  u_max = np.array([[2.0]])

  # For sin on -pi to pi, we'll have a ramp and a plateau piece:
  breaks = [
      -3 * np.pi / 2 + 0.6, -np.pi / 2 - 0.6, -np.pi / 2 + 0.6,
      np.pi / 2 - 0.6, np.pi / 2 + 0.6, 3 * np.pi / 2 - 0.6
  ]
  break_breaks = [6, 6, 6]
  mins = [
      -1, -np.sin(-np.pi / 2 - 0.6), -np.sin(-np.pi / 2 + 0.6), -1, -1,
      -np.sin(3 * np.pi / 2 - 0.6)
  ]
  maxes = [
      -np.sin(-3 * np.pi / 2 + 0.6), 1, 1, -np.sin(np.pi / 2 - 0.6),
      -np.sin(np.pi / 2 + 0.6), 1
  ]
  lower_As = []
  upper_As = []
  B = np.array([[0.0], [t], [0.0]])
  for i in range(len(breaks) - 1):
    max_m = (maxes[i + 1] - maxes[i]) / (breaks[i + 1] - breaks[i])
    min_m = (mins[i + 1] - mins[i]) / (breaks[i + 1] - breaks[i])
    # lA * x + B * u <= x' <= uA * x + B * u
    lower_As.append(
        np.matrix([[0.0, 1.0, 0.0], [9.81 / l * min_m, 0.0, 0.0],
                   [0.0, 0.0, 0.0]]))
    upper_As.append(
        np.matrix([[0.0, 1.0, 0.0], [9.81 / l * max_m, 0.0, 0.0],
                   [0.0, 0.0, 0.0]]))
  Bs = [B] * len(lower_As)

  uA = np.matrix([[0.0, -1.0, 0.0]])
  ub = np.matrix([[-max_speed]])

  env = PolySysEnvironment(f, f_to_str, rewardf, testf, None, 3, 1, None,
                           None, s_min, s_max, x_min=x_min, x_max=x_max,
                           u_min=u_min, u_max=u_max, timestep=0.01,
                           unsafe_A=uA, unsafe_b=ub, approx=True,
                           breaks=breaks, break_breaks=break_breaks,
                           lower_As=lower_As, lower_Bs=Bs,
                           upper_As=upper_As, upper_Bs=Bs)

  if retrain_nn:
    args = {
        'actor_lr': 0.0001,
        'critic_lr': 0.001,
        'actor_structure': actor_structure,
        'critic_structure': critic_structure,
        'buffer_size': 1000000,
        'gamma': 0.99,
        'max_episode_len': episode_len,
        'max_episodes': learning_episodes,
        'minibatch_size': 64,
        'random_seed': 6554,  # 6553
        'tau': 0.005,
        'model_path': train_dir + "retrained_model.chkp",
        'enable_test': nn_test,
        'test_episodes': test_episodes,
        'test_episodes_len': 500
    }
  else:
    args = {
        'actor_lr': 0.0001,
        'critic_lr': 0.001,
        'actor_structure': actor_structure,
        'critic_structure': critic_structure,
        'buffer_size': 1000000,
        'gamma': 0.99,
        'max_episode_len': episode_len,
        'max_episodes': 0,
        'minibatch_size': 64,
        'random_seed': 6553,
        'tau': 0.005,
        'model_path': train_dir + "model.chkp",
        'enable_test': nn_test,
        'test_episodes': test_episodes,
        'test_episodes_len': 500
    }

  Ks = [np.matrix([[0.0, 0.0, 0.0]])]
  invs = [(np.matrix([[1, 0, 0], [-1, 0, 0], [0, 1, 0], [0, -1, 0]]),
           np.matrix([[3.3], [3.3], [max_speed], [max_speed]]))]
  covers = [(invs[0][0], invs[0][1],
             np.matrix([[-2.0], [-max_speed / 2], [1.0]]),
             np.matrix([[2.0], [max_speed / 2], [1.0]]))]
  bound = 10

  initial_shield = Shield(env, K_list=Ks, inv_list=invs, cover_list=covers,
                          bound=bound)

  actor, shield = DDPG(env, args, safe_training=safe_training,
                       shields=shields, initial_shield=initial_shield,
                       penalty_ratio=penalty_ratio)

  if shield_test:
    shield.test_shield(actor, test_episodes, 5000)

  actor.sess.close()
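# A standalone sanity check (not part of the benchmark) that the
# piecewise-linear segments above cover the pendulum's whole angular
# operating range [x_min[0], x_max[0]] and are listed in increasing order;
# that the environment requires such coverage is an assumption here.
def _check_pendulum_breaks_cover_range():
  import numpy as np
  breaks = [
      -3 * np.pi / 2 + 0.6, -np.pi / 2 - 0.6, -np.pi / 2 + 0.6,
      np.pi / 2 - 0.6, np.pi / 2 + 0.6, 3 * np.pi / 2 - 0.6
  ]
  assert breaks[0] <= -3.3 and breaks[-1] >= 3.3  # covers [-3.3, 3.3]
  assert all(b1 < b2 for b1, b2 in zip(breaks, breaks[1:]))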
def road(learning_method, number_of_rollouts, simulation_steps,
         learning_episodes, actor_structure, critic_structure, train_dir,
         nn_test=False, retrain_shield=False, shield_test=False,
         test_episodes=100, retrain_nn=False, safe_training=False, shields=1,
         episode_len=100, penalty_ratio=0.1):
  # States: pos_x, pos_y, vel_x, vel_y, constant
  # Actions: acc_x, acc_y
  # Note: this two-dimensional variant shares its name with the
  # one-dimensional road benchmark above; if both are defined in the same
  # module, this definition shadows the earlier one.
  A = np.matrix([[0.0, 0.0, 10.0, 0.0, 0.0], [0.0, 0.0, 0.0, 10.0, 0.0],
                 [0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0],
                 [0.0, 0.0, 0.0, 0.0, 0.0]])
  B = 0.5 * np.matrix([[0.0, 0.0], [0.0, 0.0], [10.0, 0.0], [0.0, 10.0],
                       [0.0, 0.0]])

  # s_min = np.array([[-0.1], [-0.1], [-0.1], [-0.1], [1]])
  # s_max = np.array([[0.1], [0.1], [0.1], [0.1], [1]])
  s_min = np.array([[0.0], [0.0], [0.0], [0.0], [1.0]])
  s_max = np.array([[0.0], [0.0], [0.0], [0.0], [1.0]])

  x_goal = 3.0
  y_goal = 3.0
  max_speed = 10.0
  x_min = np.array([[-100.0], [-100.0], [-max_speed], [-max_speed], [0.0]])
  x_max = np.array([[100.0], [100.0], [max_speed], [max_speed], [2.0]])

  def f(x, u):
    dA = -0.06 + 0.12 * np.random.rand(5, 5)
    for i in range(5):
      dA[4, i] = 0.0
    dB = -0.06 + 0.12 * np.random.rand(5, 2)
    for i in range(2):
      dB[4, i] = 0.0
    return (A + dA) * x + (B + dB) * u

  def f_to_str(K):
    raise NotImplementedError

  def testf(x, u):
    return np.sqrt(x[2, 0]**2 + x[3, 0]**2) > max_speed

  def rewardf(x, Q, u, R):
    return -(abs(x[0, 0] - x_goal) + abs(x[1, 0] - y_goal))

  def terminalf(x):
    return x[0, 0] >= x_goal and x[1, 0] >= y_goal

  lower_As = [A - np.full(A.shape, 0.06)]
  upper_As = [A + np.full(A.shape, 0.06)]
  lower_Bs = [B - np.full(B.shape, 0.06)]
  upper_Bs = [B + np.full(B.shape, 0.06)]

  u_min = np.array([[-2.0], [-2.0]])
  u_max = np.array([[5.0], [5.0]])

  # The safety property here is that sqrt(v_x^2 + v_y^2) < max_speed, but we
  # can't handle this constraint directly, so we need a linear
  # overapproximation of the unsafe set. We can get one by inscribing an
  # n-gon in the circle defined by the actual safety constraint: every
  # velocity outside the polygon (including some that are still inside the
  # circle) is treated as unsafe.
  n = 8
  xs = []
  ys = []
  for i in range(n):
    ang = 2 * i * np.pi / n + np.pi / n
    xs.append(max_speed * np.cos(ang))
    ys.append(max_speed * np.sin(ang))
  unsafe_A = []
  unsafe_b = []
  for i in range(n):
    j = (i + 1) % n
    if abs(xs[j] - xs[i]) < 0.000001:
      if xs[i] >= 0.0:
        unsafe_A.append(np.matrix([[0.0, 0.0, -1.0, 0.0, 0.0]]))
        unsafe_b.append(np.matrix([[-xs[i]]]))
      else:
        unsafe_A.append(np.matrix([[0.0, 0.0, 1.0, 0.0, 0.0]]))
        unsafe_b.append(np.matrix([[xs[i]]]))
    else:
      m = (ys[j] - ys[i]) / (xs[j] - xs[i])
      b = ys[i] - m * xs[i]
      if b >= 0.0:
        unsafe_A.append(np.matrix([[0.0, 0.0, m, -1.0, 0.0]]))
        unsafe_b.append(np.matrix([[-b]]))
      else:
        unsafe_A.append(np.matrix([[0.0, 0.0, -m, 1.0, 0.0]]))
        unsafe_b.append(np.matrix([[b]]))

  breaks = []
  break_breaks = []

  env = PolySysEnvironment(f, f_to_str, rewardf, testf, None, 5, 2, None,
                           None, s_min, s_max, x_min=x_min, x_max=x_max,
                           u_min=u_min, u_max=u_max, unsafe_A=unsafe_A,
                           unsafe_b=unsafe_b, timestep=0.01, approx=True,
                           breaks=breaks, break_breaks=break_breaks,
                           lower_As=lower_As, lower_Bs=lower_Bs,
                           upper_As=upper_As, upper_Bs=upper_Bs,
                           terminalf=terminalf)

  if retrain_nn:
    args = {
        'actor_lr': 0.0001,
        'critic_lr': 0.001,
        'actor_structure': actor_structure,
        'critic_structure': critic_structure,
        'buffer_size': 1000000,
        'gamma': 0.99,
        'max_episode_len': episode_len,
        'max_episodes': learning_episodes,  # originally 1000
        'minibatch_size': 64,
        'random_seed': 6553,
        'tau': 0.005,
        'model_path': train_dir + "retrained_model.chkp",
        'enable_test': nn_test,
        'test_episodes': test_episodes,
        'test_episodes_len': 5000
    }
  else:
    args = {
        'actor_lr': 0.0001,
        'critic_lr': 0.001,
        'actor_structure': actor_structure,
        'critic_structure': critic_structure,
        'buffer_size': 1000000,
        'gamma': 0.99,
        'max_episode_len': episode_len,
        'max_episodes': learning_episodes,
        'minibatch_size': 64,
        'random_seed': 6553,
        'tau': 0.005,
        'model_path': train_dir + "model.chkp",
        'enable_test': nn_test,
        'test_episodes': test_episodes,
        'test_episodes_len': 5000
    }

  Ks = [np.matrix([[0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0]])]
  inv_A = np.matrix(np.zeros((len(unsafe_A), unsafe_A[0].shape[1])))
  inv_b = np.matrix(np.zeros((len(unsafe_b), 1)))
  for i in range(len(unsafe_A)):
    inv_A[i] = -unsafe_A[i][0]
    inv_b[i, 0] = -2.0 * unsafe_b[i][0, 0]
  invs = [(inv_A, inv_b)]
  covers = [(invs[0][0], invs[0][1],
             np.matrix([[-1.0, -1.0, -1.0, -1.0, 1.0]]),
             np.matrix([[x_goal, y_goal, max_speed / 2.0, max_speed / 2.0,
                         1.0]]))]
  bound = 20

  initial_shield = Shield(env, K_list=Ks, inv_list=invs, cover_list=covers,
                          bound=bound)

  actor, shield = DDPG(env, args, safe_training=safe_training,
                       shields=shields, initial_shield=initial_shield,
                       penalty_ratio=penalty_ratio, bound=bound)

  if shield_test:
    shield.test_shield(actor, test_episodes, 5000)

  actor.sess.close()
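# A standalone geometric sanity check of the n-gon construction above
# (illustrative, not part of the benchmark). Using the convention that a
# state is unsafe iff unsafe_A * x <= unsafe_b, every velocity just outside
# the max_speed circle should violate at least one half-plane, while every
# velocity inside the polygon's inscribed circle of radius
# max_speed * cos(pi / n) should satisfy all of them.
def _check_ngon_overapproximation(n=8, max_speed=10.0):
  import numpy as np
  xs = [max_speed * np.cos(2 * i * np.pi / n + np.pi / n) for i in range(n)]
  ys = [max_speed * np.sin(2 * i * np.pi / n + np.pi / n) for i in range(n)]
  halfplanes = []  # (a_vx, a_vy, b): unsafe iff a_vx*vx + a_vy*vy <= b
  for i in range(n):
    j = (i + 1) % n
    if abs(xs[j] - xs[i]) < 1e-6:  # vertical edge, same cases as above
      if xs[i] >= 0.0:
        halfplanes.append((-1.0, 0.0, -xs[i]))
      else:
        halfplanes.append((1.0, 0.0, xs[i]))
    else:
      m = (ys[j] - ys[i]) / (xs[j] - xs[i])
      b = ys[i] - m * xs[i]
      if b >= 0.0:
        halfplanes.append((m, -1.0, -b))
      else:
        halfplanes.append((-m, 1.0, b))
  inradius = max_speed * np.cos(np.pi / n)
  for ang in np.linspace(0.0, 2 * np.pi, 100):
    # Just inside the inscribed circle: safe under every half-plane.
    vx, vy = 0.99 * inradius * np.cos(ang), 0.99 * inradius * np.sin(ang)
    assert all(a * vx + c * vy > b for (a, c, b) in halfplanes)
    # Just outside the speed circle: unsafe under some half-plane.
    vx, vy = 1.01 * max_speed * np.cos(ang), 1.01 * max_speed * np.sin(ang)
    assert any(a * vx + c * vy <= b for (a, c, b) in halfplanes)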
def selfdrive(learning_method, number_of_rollouts, simulation_steps,
              learning_episodes, critic_structure, actor_structure,
              train_dir, nn_test=False, retrain_shield=False,
              shield_test=False, test_episodes=100, retrain_nn=False):
  # 2-dimensional and 1-input system.
  ds = 2
  us = 1

  # The speed is fixed to 2 in this case.
  v = 2
  cl = 2
  cr = -2

  def f(x, u):
    # x[0] is the car's distance from the lane center and x[1] its heading
    # angle; sin(x[1]) is approximated by its third-order Taylor expansion.
    delta = np.zeros((ds, 1), float)
    delta[0, 0] = -v * (x[1, 0] - ((pow(x[1, 0], 3)) / 6))
    delta[1, 0] = u[0, 0]  # angular velocity (controlled by the learner)
    return delta

  # Closed-loop system dynamics as text.
  def f_to_str(K):
    kstr = K_to_str(K)
    f = []
    f.append("-{}*(x[2] - ((x[2]^3)/6))".format(v))
    f.append(kstr[0])
    return f

  h = 0.1

  # Amount of Gaussian noise in dynamics.
  eq_err = 1e-2

  pi = 3.1415926

  # Initial state space.
  s_min = np.array([[-1], [-pi / 4]])
  s_max = np.array([[1], [pi / 4]])
  u_min = np.array([[-10]])
  u_max = np.array([[10]])

  # The only portion of the entire state space that our verification is
  # interested in.
  bound_x_min = np.array([[None], [-pi / 2]])
  bound_x_max = np.array([[None], [pi / 2]])

  # Sample an initial condition for the system.
  x0 = np.matrix([[random.uniform(s_min[0, 0], s_max[0, 0])],
                  [random.uniform(s_min[1, 0], s_max[1, 0])]])
  print("Sampled initial state is:\n {}".format(x0))

  # Reward functions.
  Q = np.zeros((2, 2), float)
  np.fill_diagonal(Q, 1)
  R = np.zeros((1, 1), float)
  np.fill_diagonal(R, 1)

  # User-defined unsafe condition: the car crosses the left or right lane
  # boundary.
  def unsafe_eval(x):
    outbound1 = -(x[0, 0] - cr) * (cl - x[0, 0])
    if outbound1 >= 0:
      return True
    return False

  def unsafe_string():
    return ["-(x[1]- {})*({}-x[1])".format(cr, cl)]

  def rewardf(x, Q, u, R):
    reward = 0
    reward += -np.dot(x.T, Q.dot(x)) - np.dot(u.T, R.dot(u))
    if unsafe_eval(x):
      reward -= 100
    return reward

  def testf(x, u):
    if unsafe_eval(x):
      return -1
    return 0

  def random_test(f, K, simulation_steps, continuous=True, timestep=h):
    total_fails = 0
    for i in range(100):
      x0 = np.matrix([[random.uniform(s_min[0, 0], s_max[0, 0])],
                      [random.uniform(s_min[1, 0], s_max[1, 0])]])
      reward = test_controller_helper(f, K, x0, simulation_steps, testf,
                                      continuous=True, timestep=h)
      if reward < 0:
        print("Failed on {}".format(x0))
        total_fails += 1
    print("Among {} tests, {} failed.".format(100, total_fails))

  names = {0: "p", 1: "gamma"}

  # Use the shield to directly learn a linear controller.
  env = PolySysEnvironment(f, f_to_str, rewardf, testf, unsafe_string, ds,
                           us, Q, R, s_min, s_max, u_max=u_max, u_min=u_min,
                           bound_x_min=bound_x_min, bound_x_max=bound_x_max,
                           timestep=0.1)

  if retrain_nn:
    args = {
        'actor_lr': 0.0001,
        'critic_lr': 0.001,
        'actor_structure': actor_structure,
        'critic_structure': critic_structure,
        'buffer_size': 1000000,
        'gamma': 0.99,
        'max_episode_len': 100,
        'max_episodes': 1000,
        'minibatch_size': 64,
        'random_seed': 6553,
        'tau': 0.005,
        'model_path': train_dir + "retrained_model.chkp",
        'enable_test': nn_test,
        'test_episodes': test_episodes,
        'test_episodes_len': 1000
    }
  else:
    args = {
        'actor_lr': 0.0001,
        'critic_lr': 0.001,
        'actor_structure': actor_structure,
        'critic_structure': critic_structure,
        'buffer_size': 1000000,
        'gamma': 0.99,
        'max_episode_len': 100,
        'max_episodes': learning_episodes,
        'minibatch_size': 64,
        'random_seed': 6553,
        'tau': 0.005,
        'model_path': train_dir + "model.chkp",
        'enable_test': nn_test,
        'test_episodes': test_episodes,
        'test_episodes_len': 1000
    }

  actor = DDPG(env, args=args)

  model_path = os.path.split(args['model_path'])[0] + '/'
  linear_func_model_name = 'K.model'
  model_path = model_path + linear_func_model_name + '.npy'

  def rewardf(x, Q, u, R):
    return np.matrix([[env.reward(x, u)]])

  shield = Shield(env, actor, model_path=model_path,
                  force_learning=retrain_shield)
  shield.train_polysys_shield(learning_method, number_of_rollouts,
                              simulation_steps, explore_mag=0.04,
                              step_size=0.03, without_nn_guide=True)
  if shield_test:
    shield.test_shield(test_episodes, 1000)

  actor.sess.close()
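# A minimal, hypothetical entry point showing how these benchmark functions
# are invoked. All concrete values below (the learning-method name, rollout
# counts, network shapes, and checkpoint directory) are illustrative
# assumptions, not settings taken from the original experiments.
if __name__ == '__main__':
  car(learning_method='random_search',  # assumed name of a search method
      number_of_rollouts=50,
      simulation_steps=100,
      learning_episodes=1000,
      actor_structure=[240, 200],
      critic_structure=[280, 240, 200],
      train_dir='ddpg_chkp/car/',  # hypothetical checkpoint directory
      retrain_nn=True,
      nn_test=True)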