from collections import namedtuple

import numpy as np
import tensorflow as tf

import rospy
import mavros
import message_filters
from mavros import command
from mavros_msgs.msg import AttitudeTarget, RCOut
from mavros_msgs.srv import SetMode
from geometry_msgs.msg import PoseStamped, TwistStamped, Quaternion, Vector3

# Project-local modules (import paths are assumptions; adjust to this package's layout)
from learner import Learner
from replay_buffer import ReplayBuffer
from barrier import Barrier
from rl_pos_controller import RLPosController
from bintel.msg import StateReward


class RL_lander():
    """Quadrotor landing node that mixes a learned (DDPG-style) vertical-thrust
    policy with a PD control prior and filters the result through a control
    barrier function (CBF) safety filter before publishing attitude setpoints."""

    def __init__(self, n_ep, ep_length):
        # Development environment variables
        self.is_simulation = False
        self.is_test_mode = False  # Calculates all controls with PD (no RL)
        self.isTraining = False

        # Initialize current state variables.
        # NOTE: the namedtuple classes are used here as simple mutable attribute
        # containers; values are stored on the classes themselves, not on instances.
        self.p = namedtuple("p", "x y z")
        self.q = namedtuple("q", "w x y z")
        self.v = namedtuple("v", "x y z")
        self.omg = namedtuple("omg", "x y z")
        self.p.x, self.p.y, self.p.z, self.q.w, self.q.x, self.q.y, self.q.z, \
            self.v.x, self.v.y, self.v.z, self.omg.x, self.omg.y, self.omg.z = \
            0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.

        # Initialize desired thrust and attitude variables for attitude commands
        self.p_init = namedtuple("p_init", "x y z")
        self.p_final = namedtuple("p_final", "x y z")
        self.v_d = namedtuple("v_d", "x y z")
        if self.is_simulation:
            self.p_init.x, self.p_init.y, self.p_init.z = 0., 0., 1.5
            self.p_final.x, self.p_final.y, self.p_final.z = 0., 0., 0.1
        else:
            self.p_init.x, self.p_init.y, self.p_init.z = 0.1, -0.65, 1.5
            self.p_final.x, self.p_final.y, self.p_final.z = 0.1, -0.65, 1.
        self.v_d.x, self.v_d.y, self.v_d.z = 0., 0., 0.
        self.T_d = 0.0
        self.q_d = namedtuple("q_d", "w x y z")
        self.omg_d = namedtuple("omg_d", "x y z")
        self.omg_d.x, self.omg_d.y, self.omg_d.z = 0., 0., 0.

        # Initialize ROS
        self.main_loop_rate = 60
        self.init_ROS()
        self.msg = AttitudeTarget()
        self.rl_train_msg = StateReward()
        self.rl_pos_controller = RLPosController(is_simulation=self.is_simulation,
                                                 is_test_mode=self.is_test_mode)

        # Initialize RL-related variables
        sess = tf.Session()
        state_dim, action_dim = 2, 1
        self.action_bound = 0.5
        actor_lr, critic_lr = 0.0002, 0.0004
        gamma, tau = 0.995, 0.0001
        self.minibatch_size = 512
        self.learner = Learner(sess, 0., state_dim, action_dim, self.action_bound,
                               actor_lr, critic_lr, tau, gamma, self.minibatch_size)
        self.l_mix = 1000.
        self.counter = 0

        # Create replay buffers to track explored points (one per altitude band)
        self.rl_buffer_high = ReplayBuffer(1000000)  # ((z1, zdot1), T, r, (z2, zdot2))
        self.rl_buffer_mid = ReplayBuffer(1000000)
        self.rl_buffer_low = ReplayBuffer(1000000)

        # Train message initialization
        self.z_RL_prev = 0.
        self.zdot_RL_prev = 0.
        self.cur_reward_prev = 0.
        self.T_z_prev = 0.
        self.prior_prev = 0.
        self.end_of_ep_prev = False

        # Initialize safety filter (barrier function)
        a_max = 0.44  # Saturation accounting for hover
        a_min = -0.56
        self.cbf = Barrier(state_dim, action_dim, a_max, a_min)

        self.n_ep = n_ep
        self.save_ep_reward = np.empty(n_ep)
        self.z_RL = 0.
        self.zdot_RL = 0.
        self.T_z = 0.
        self.prior = 0.
        self.t_last_rl_msg = rospy.Time.now()
        self.t_last_msg_p = rospy.Time.now()
        self.t_last_msg_v = rospy.Time.now()
        self.cur_reward = 0.
        self.land_threshold = 0.075
        self.end_of_ep = False
        self.safety_intervention = 0.
        self.RL_received = False

        # Initialize arrays to save episodic state and control values
        self.ep_length = ep_length
        self.z_ep = np.empty((self.n_ep, self.ep_length))
        self.zdot_ep = np.empty((self.n_ep, self.ep_length))
        self.T_ep = np.empty((self.n_ep, self.ep_length))
        self.rl_buffer = ReplayBuffer(1000000)  # ((z1, zdot1), T, r, (z2, zdot2))

    def output_command(self, s, a_prior):
        # Mix the noisy RL action with the control prior:
        # a = a_rl/(1 + l_mix) + l_mix*a_prior/(1 + l_mix)
        a = self.learner.actor.predict(s) + self.action_bound * self.learner.actor_noise()
        # return a + a_prior
        return a / (1 + self.l_mix) + self.l_mix * a_prior / (1 + self.l_mix)

    def output_command_noNoise(self, s, a_prior):
        # Same mixing as output_command, but without exploration noise
        a = self.learner.actor.predict(s)
        return a / (1 + self.l_mix) + self.l_mix * a_prior / (1 + self.l_mix)

    def safety_filter(self, s, a):
        # CBF filter: compute the minimal correction u_bar that keeps the
        # proposed action a inside the safe set
        [f, g, x] = self.get_dynamics(s)
        u_bar = self.cbf.control_barrier(a, f, g, x)
        return a + u_bar, u_bar

    def get_dynamics(self, s):
        # Discrete-time double-integrator model of the vertical dynamics
        T = 1. / 60      # Sampling period (s)
        G = 0.           # Gravity
        m = 0.56 / 9.81  # Scaled mass
        s = np.squeeze(s)
        f = np.array([s[0] + s[1] * T, s[1] - G * T])
        g = np.array([T ** 2 / (2 * m), T / m])
        return [f, g, s]

    def train_rl(self):
        # Train once each altitude band holds its share of a minibatch
        minibatch_size = self.minibatch_size
        if (self.rl_buffer_high.count > minibatch_size / 8
                and self.rl_buffer_mid.count > minibatch_size / 4
                and self.rl_buffer_low.count > 5 * minibatch_size / 8):
            self.learner.train(self.rl_buffer_low, self.rl_buffer_mid,
                               self.rl_buffer_high, minibatch_size)

    def training(self):
        for ep in range(self.n_ep):
            self.counter = ep
            self.isTraining = False
            print("Resetting position...")
            self.reset_position()
            self.run_episode()

    def run_episode(self):
        cum_reward = 0.
        self.end_of_ep = False
        print("Running episode...")
        for t in range(self.ep_length):
            if self.isTraining:
                cum_reward += self.cur_reward
            self.rate.sleep()
            self.isTraining = True
        # Publish final message with end_of_ep flag set to true
        self.end_of_ep = True
        self.rate.sleep()
        print("Episode " + str(self.counter) + " reward: " + str(cum_reward))

    def pd_attitude_ctrl(self):
        # Make sure state is consistent with what is sent to RL
        p = self.p
        v = self.v
        p.z = self.z_RL
        v.z = self.zdot_RL
        self.T_d, q_d, _ = self.rl_pos_controller.get_ctrl(
            p=p, q=self.q, v=v, omg=self.omg,
            p_d=self.p_final, v_d=self.v_d, T_RL=self.T_z)
        self.q_d.x, self.q_d.y, self.q_d.z, self.q_d.w = q_d

    def calc_control_prior(self):
        # Make sure state is consistent with what is sent to RL
        p = self.p
        v = self.v
        p.z = self.z_RL
        v.z = self.zdot_RL
        self.prior = self.rl_pos_controller.get_prior(
            p=p, q=self.q, v=v, omg=self.omg, p_d=self.p_final, v_d=self.v_d)

    def calc_reward(self):
        reward_type = 8  # Specifies which reward to use
        if reward_type == 1:
            if self.z_RL < self.land_threshold:
                self.cur_reward = -self.z_RL + 1.0 * self.zdot_RL  # Penalize negative velocity
            else:
                self.cur_reward = -self.z_RL
        elif reward_type == 2:
            if self.z_RL < self.land_threshold:
                self.cur_reward = -1. + 1.0 * self.zdot_RL  # Penalize negative velocity
            else:
                self.cur_reward = -1.
        elif reward_type == 3:
            self.cur_reward = -self.z_RL + 1.0 * self.zdot_RL / self.z_RL  # Penalize negative velocity
        elif reward_type == 4:
            d = 2.
            self.cur_reward = max(-d, -d * abs(self.zdot_RL) / self.z_RL
                                  - d * self.z_RL)  # Penalize negative velocity
        elif reward_type == 5:
            d = 0.5
            self.cur_reward = min(-10 * d * self.z_RL,
                                  -d * abs(self.zdot_RL) / self.z_RL
                                  - 5 * d * self.z_RL)  # Penalize negative velocity
        elif reward_type == 6:
            d = 20
            h_alt = self.p_final.z  # Hover altitude
            v_c = 0.5  # Max landing velocity
            self.cur_reward = min(
                -1 - abs(abs(self.z_RL) - h_alt) - self.safety_intervention,
                -1 - abs(abs(self.z_RL) - h_alt) - self.safety_intervention
                - 0.2 * (abs(self.zdot_RL) - v_c) / (self.z_RL ** 2 + 0.1))
        elif reward_type == 7:
            T = 20  # Time penalty
            d = 20  # Scaling factor
            h_alt = self.p_final.z  # Hover altitude
            v_c = 0.5  # Max landing velocity
            if abs(self.z_RL - h_alt) <= 0.01:
                # Stop penalizing time if close enough to hover altitude
                T = 0.
            self.cur_reward = min(
                -T - d * abs(abs(self.z_RL) - h_alt),
                -T - d * abs(abs(self.z_RL) - h_alt)
                - (abs(self.zdot_RL) - v_c) / (self.z_RL ** 2 + 0.1))
        elif reward_type == 8:
            z_target = self.p_final.z
            if z_target - 0.04 <= self.z_RL < z_target + 0.2:
                self.cur_reward = -abs(self.z_RL - z_target)  # - 3*abs(self.safety_intervention)
            elif self.z_RL < z_target - 0.04:
                self.cur_reward = -2 * abs(self.z_RL - z_target) \
                    + min(self.zdot_RL, 0.)  # - 3*abs(self.safety_intervention)
            else:
                self.cur_reward = -0.2  # - 3*abs(self.safety_intervention)

    def reset_position(self):
        # Arm the drone
        mavros.set_namespace()
        command.arming(True)
        rospy.wait_for_service('mavros/set_mode')
        change_mode = rospy.ServiceProxy('mavros/set_mode', SetMode)
        self.waypoint = PoseStamped()
        self.waypoint.header.frame_id = 'map'
        self.waypoint.pose.position.x = self.p_init.x
        self.waypoint.pose.position.y = self.p_init.y
        self.waypoint.pose.position.z = self.p_init.z
        # Fly back to the initial position in OFFBOARD mode until within tolerance
        while not rospy.is_shutdown() and (
                np.linalg.norm(np.array([self.p_init.x - self.p.x,
                                         self.p_init.y - self.p.y])) > 0.2
                or abs(self.p_init.z - self.p.z) > 0.05):
            # or rospy.Duration.from_sec(rospy.get_time() - self.t_last_rl_msg.to_sec()).to_sec() > 0.05):
            result_mode = change_mode(0, "OFFBOARD")
            self.pub_posmsg.publish(self.waypoint)
            self.rate.sleep()

    def init_ROS(self):
        self.pub_attmsg = rospy.Publisher('/mavros/setpoint_raw/attitude',
                                          AttitudeTarget, queue_size=10)
        self.pub_posmsg = rospy.Publisher('/mavros/setpoint_position/local',
                                          PoseStamped, queue_size=10)
        rospy.init_node('controller_bintel', anonymous=True)
        self.rate = rospy.Rate(self.main_loop_rate)

        # Subscribe to local position and velocity, time-synchronized
        self.local_pose = PoseStamped()
        self.velocity = TwistStamped()
        self.rc_out = RCOut()
        pos_sub = message_filters.Subscriber('/mavros/local_position/pose', PoseStamped)
        if self.is_simulation:
            vel_sub = message_filters.Subscriber('/mavros/local_position/velocity_body',
                                                 TwistStamped)
        else:
            vel_sub = message_filters.Subscriber('/mavros/local_position/velocity',
                                                 TwistStamped)
        ts = message_filters.TimeSynchronizer([pos_sub, vel_sub], 1)
        ts.registerCallback(self._read_pos_vel)

    def _read_position(self, data):
        self.p.x, self.p.y, self.p.z = data.pose.position.x, data.pose.position.y, \
            data.pose.position.z
        self.q.w, self.q.x, self.q.y, self.q.z = data.pose.orientation.w, \
            data.pose.orientation.x, data.pose.orientation.y, data.pose.orientation.z
        self.t_last_msg_p = rospy.Time(secs=int(data.header.stamp.secs),
                                       nsecs=data.header.stamp.nsecs)

    def _read_velocity(self, data):
        self.v.x, self.v.y, self.v.z = data.twist.linear.x, data.twist.linear.y, \
            data.twist.linear.z
        self.omg.x, self.omg.y, self.omg.z = data.twist.angular.x, \
            data.twist.angular.y, data.twist.angular.z
        self.t_last_msg_v = rospy.Time(secs=int(data.header.stamp.secs),
                                       nsecs=data.header.stamp.nsecs)

    def _read_pos_vel(self, pos_data, vel_data):
        self.p.x, self.p.y, self.p.z = pos_data.pose.position.x, \
            pos_data.pose.position.y, pos_data.pose.position.z
        self.q.w, self.q.x, self.q.y, self.q.z = pos_data.pose.orientation.w, \
            pos_data.pose.orientation.x, pos_data.pose.orientation.y, \
            pos_data.pose.orientation.z
        self.t_last_msg_p = rospy.Time(secs=int(pos_data.header.stamp.secs),
                                       nsecs=pos_data.header.stamp.nsecs)
        self.v.x, self.v.y, self.v.z = vel_data.twist.linear.x, \
            vel_data.twist.linear.y, vel_data.twist.linear.z
        self.omg.x, self.omg.y, self.omg.z = vel_data.twist.angular.x, \
            vel_data.twist.angular.y, vel_data.twist.angular.z
        self.t_last_msg_v = rospy.Time(secs=int(vel_data.header.stamp.secs),
                                       nsecs=vel_data.header.stamp.nsecs)

        # Get state pair, control prior and reward, send to RL
        if self.isTraining:
            self.z_RL = self.p.z
            self.zdot_RL = self.v.z
            self.calc_control_prior()
            self.calc_reward()
            s = [self.z_RL, self.zdot_RL]
            if self.counter < 30:
                # Exploration noise only during the first 30 episodes
                a_rl = np.squeeze(self.output_command(
                    np.array([[self.z_RL, self.zdot_RL]]), self.prior))
            else:
                a_rl = np.squeeze(self.output_command_noNoise(
                    np.array([[self.z_RL, self.zdot_RL]]), self.prior))
            self.T_z, _ = np.squeeze(self.safety_filter(s, a_rl))
            self.create_rl_train_message(stamp=rospy.Time.now())

        # Calculate control action using command from RL
        self.pd_attitude_ctrl()
        self.create_attitude_msg(stamp=rospy.Time.now())
        self.pub_attmsg.publish(self.msg)

    def create_attitude_msg(self, stamp):
        # Set the header
        self.msg.header.stamp = stamp
        self.msg.header.frame_id = 'map'
        # Set message content
        self.msg.orientation = Quaternion(x=self.q_d.x, y=self.q_d.y,
                                          z=self.q_d.z, w=self.q_d.w)
        self.msg.body_rate = Vector3(x=self.omg_d.x, y=self.omg_d.y, z=self.omg_d.z)
        self.msg.thrust = self.T_d

    def create_rl_train_message(self, stamp):
        # Add the previous transition to the replay buffer
        s = np.array([self.z_RL_prev, self.zdot_RL_prev])  # TODO: Set s to be state at one timestep back
        # Recover the pure RL action from the deployed (mixed) thrust
        a_s = self.T_z_prev - self.l_mix * np.squeeze(self.prior_prev) / (1 + self.l_mix)
        a = (self.l_mix + 1) * a_s
        s2 = np.array([self.z_RL, self.zdot_RL])
        # Only store transitions without altitude jumps whose velocity change
        # is consistent with the sign of the applied thrust
        if abs(self.z_RL - self.z_RL_prev) < 0.2 \
                and (s2[1] - s[1]) * np.squeeze(self.T_z_prev) > 0:
            if self.z_RL <= 0.2:
                self.rl_buffer_low.add(s, a, self.cur_reward_prev, self.end_of_ep, s2)
            elif 0.2 < self.z_RL < 0.8:
                self.rl_buffer_mid.add(s, a, self.cur_reward_prev, self.end_of_ep, s2)
            else:
                self.rl_buffer_high.add(s, a, self.cur_reward_prev, self.end_of_ep, s2)
        self.z_RL_prev = self.z_RL
        self.zdot_RL_prev = self.zdot_RL
        self.cur_reward_prev = self.cur_reward
        self.T_z_prev = self.T_z
        self.prior_prev = self.prior
        self.end_of_ep_prev = self.end_of_ep
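
# Sanity-check sketch (not part of the original source): with the mixing used in
# output_command, a_mixed = a_rl/(1 + l_mix) + l_mix*a_prior/(1 + l_mix), and
# create_rl_train_message inverts it to recover a_rl before storing the
# transition. This standalone helper verifies the round trip numerically; the
# recovery is exact only when the CBF correction u_bar is zero.
def _check_mixing_round_trip(l_mix=1000., a_rl=0.3, a_prior=0.1):
    a_mixed = a_rl / (1 + l_mix) + l_mix * a_prior / (1 + l_mix)
    # Inversion as done in create_rl_train_message
    a_s = a_mixed - l_mix * a_prior / (1 + l_mix)
    a_recovered = (l_mix + 1) * a_s
    assert abs(a_recovered - a_rl) < 1e-9
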
class RL_Controller():
    """Off-board RL learner node: subscribes to state/reward messages from the
    lander, maintains a replay buffer, trains the DDPG learner, and publishes
    mixed thrust commands."""

    def __init__(self):
        self.is_simulation = True

        # Create replay buffer to track explored points
        self.rl_buffer = ReplayBuffer(100000)  # ((z1, zdot1), T, r, (z2, zdot2))

        # Initialize ROS
        self.main_loop_rate = 60
        self.init_ROS()
        self.rl_command_msg = AttitudeTarget()

        # Initialize learner
        sess = tf.Session()
        state_dim, action_dim = 2, 1
        action_bound = 1.0
        actor_lr, critic_lr = 0.0001, 0.0005
        gamma, tau = 0.995, 0.0002
        self.minibatch_size = 256
        self.learner = Learner(sess, 0., state_dim, action_dim, action_bound,
                               actor_lr, critic_lr, tau, gamma, self.minibatch_size)
        self.l_mix = 10.

        # Initialize safety filter (barrier function)
        a_max = 1.
        a_min = -1.
        self.cbf = Barrier(state_dim, action_dim, a_max, a_min)

        # Initialize variables storing current data from environment
        self.z = 0.
        self.zdot = 0.
        self.act = 0.
        self.reward = 0.
        self.prev_thrust = 0.
        self.a_prior = 0.

        # Initialize experiment variables
        self.n_ep = 1000

    def run_experiment(self):
        last_ep = 0
        training_interval = 60
        # if self.l_mix > 3:
        #     self.l_mix = self.l_mix / 1.05
        for ep in range(self.n_ep):  # TODO: Define number of iterations
            self.create_rl_command_msg(stamp=rospy.Time.now())  # Synchronize to state read
            self.pub.publish(self.rl_command_msg)
            if ep - last_ep == training_interval:
                last_ep = ep
                self.train_rl()
            self.rate.sleep()

    def output_command(self, s, a_prior):
        # Mix the noisy RL action with the control prior
        a = self.learner.actor.predict(s) + self.learner.actor_noise()
        return a / (1 + self.l_mix) + self.l_mix * a_prior / (1 + self.l_mix)

    def safety_filter(self, s, a):
        # CBF filter: compute the minimal correction u_bar that keeps the
        # proposed action a inside the safe set
        [f, g, x] = self.get_dynamics(s)
        u_bar = self.cbf.control_barrier(a, f, g, x)
        return a + u_bar, u_bar

    def get_dynamics(self, s):
        # Discrete-time double-integrator model of the vertical dynamics
        T = 1. / 60      # Sampling period (s)
        G = 0.           # Gravity
        m = 0.56 / 9.81  # Scaled mass
        s = np.squeeze(s)
        f = np.array([s[0] + s[1] * T, s[1] - G * T])
        g = np.array([T ** 2 / (2 * m), T / m])
        return [f, g, s]

    def train_rl(self):
        # Train based on replay buffer
        minibatch_size = self.minibatch_size
        if self.rl_buffer.count > 2 * minibatch_size:
            self.learner.train(self.rl_buffer, minibatch_size)

    """ Define ROS Interface """

    def init_ROS(self):
        # Publish RL thrust commands on rl/commands
        self.pub = rospy.Publisher('rl/commands', AttitudeTarget, queue_size=2)
        rospy.init_node('ddpg_bintel', anonymous=True)
        self.rate = rospy.Rate(self.main_loop_rate)
        # Subscribe to state (z, zdot), reward and deployed thrust messages
        rospy.Subscriber('rl/training', StateReward, self._read_environment)

    """ Callback function to read state and output thrust command """

    def _read_environment(self, data):
        z = data.pose.position.z
        zdot = data.velocity.linear.z
        cur_reward = data.reward.data
        dep_thrust = data.thrust.data
        t_last_msg = rospy.Time(secs=int(data.header.stamp.secs),
                                nsecs=data.header.stamp.nsecs)
        t = data.end_of_ep.data  # End-of-episode flag
        self.a_prior = data.prior.data

        # Add the previous transition to the replay buffer
        s = np.array([self.z, self.zdot])  # TODO: Set s to be state at one timestep back
        # Recover the pure RL action from the deployed (mixed) thrust
        a_s = np.squeeze(dep_thrust) - self.l_mix * np.squeeze(self.a_prior) / (1 + self.l_mix)
        a = (self.l_mix + 1) * a_s
        s2 = np.array([z, zdot])
        # self.rl_buffer.add(s, self.prev_thrust, cur_reward, t, s2)
        # self.rl_buffer.add(s, a, cur_reward, t, s2)
        self.rl_buffer.add(s, self.act, self.reward, t, s2)
        # self.create_rl_command_msg(rospy.Time.now())  # TODO: Test if creating messages here allows "parallelization"
        # self.pub.publish(self.rl_command_msg)
        self.z = z
        self.zdot = zdot
        self.act = a
        self.reward = cur_reward
        self.prev_thrust = dep_thrust

    def create_rl_command_msg(self, stamp):
        # Set the header
        self.rl_command_msg.header.stamp = stamp
        self.rl_command_msg.header.frame_id = '/world'
        # Set message content
        s = np.array([[self.z, self.zdot]])
        # a_rl = self.output_command(s, self.a_prior)  # Use cbf to filter output
        self.rl_command_msg.body_rate.z = 0.
        self.rl_command_msg.thrust = self.output_command(s, self.a_prior)
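
# Minimal entry-point sketch (assumed; the original excerpt ends without one).
# run_experiment publishes thrust commands at main_loop_rate while the
# rl/training subscriber callback fills the replay buffer in the background.
if __name__ == '__main__':
    try:
        controller = RL_Controller()
        controller.run_experiment()
    except rospy.ROSInterruptException:
        pass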