Example #1
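
# Imports inferred from the code below (they are not part of the original listing).
# The project-specific pieces (Learner, ReplayBuffer, Barrier, RLPosController and
# the custom StateReward message) come from the accompanying packages; their import
# paths are not shown here.
from types import SimpleNamespace

import numpy as np
import tensorflow as tf  # TensorFlow 1.x API (tf.Session)

import rospy
import message_filters
import mavros
from mavros import command
from mavros_msgs.msg import AttitudeTarget, RCOut
from mavros_msgs.srv import SetMode
from geometry_msgs.msg import PoseStamped, TwistStamped, Quaternion, Vector3
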
class RL_lander():
    """
    """

    def __init__(self, n_ep, ep_length):
        # Development environment variables
        self.is_simulation = False
        self.is_test_mode = False  # Calculates all controls with PD (no RL)
        self.isTraining = False

        # Initialize current state variables (position, attitude quaternion,
        # linear velocity, angular rate)
        self.p = SimpleNamespace(x=0., y=0., z=0.)
        self.q = SimpleNamespace(w=1., x=0., y=0., z=0.)
        self.v = SimpleNamespace(x=0., y=0., z=0.)
        self.omg = SimpleNamespace(x=0., y=0., z=0.)

        # Initialize desired thrust and attitude variables for attitude commands
        self.p_init = SimpleNamespace(x=0., y=0., z=0.)
        self.p_final = SimpleNamespace(x=0., y=0., z=0.)
        self.v_d = SimpleNamespace(x=0., y=0., z=0.)

        if self.is_simulation:
            self.p_init.x, self.p_init.y, self.p_init.z = 0., 0., 1.5
            self.p_final.x, self.p_final.y, self.p_final.z = 0., 0., 0.1
        else:
            self.p_init.x, self.p_init.y, self.p_init.z = 0.1, -0.65, 1.5
            self.p_final.x, self.p_final.y, self.p_final.z = 0.1, -0.65, 1.

        self.v_d.x, self.v_d.y, self.v_d.z = 0., 0., 0.
        self.T_d = 0.0
        self.q_d = SimpleNamespace(w=1., x=0., y=0., z=0.)
        self.omg_d = SimpleNamespace(x=0., y=0., z=0.)

        # Initialize ROS
        self.main_loop_rate = 60
        self.init_ROS()
        self.msg = AttitudeTarget()
        self.rl_train_msg = StateReward()
        self.rl_pos_controller = RLPosController(is_simulation=self.is_simulation, is_test_mode=self.is_test_mode)

        # Initialize RL-related variables
        sess = tf.Session()
        state_dim, action_dim = 2, 1
        self.action_bound = 0.5
        actor_lr, critic_lr = 0.0002, 0.0004
        gamma, tau = 0.995, 0.0001
        self.minibatch_size = 512
        self.learner = Learner(sess, 0., state_dim, action_dim, self.action_bound, actor_lr, critic_lr, tau, gamma, self.minibatch_size)
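        # Mixing weight between the control prior and the RL action; larger values
        # weight the prior more heavily (see output_command)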
        self.l_mix = 1000.
        self.counter = 0

        # Create Replay Buffer to track explored points (for different areas of state)                  
        self.rl_buffer_high = ReplayBuffer(1000000)    # ((z1, zdot1), T, r, (z2, zdot2))               
        self.rl_buffer_mid = ReplayBuffer(1000000)
        self.rl_buffer_low = ReplayBuffer(1000000)

        # Train message initialization
        self.z_RL_prev = 0.
        self.zdot_RL_prev = 0.
        self.cur_reward_prev = 0.
        self.T_z_prev = 0.
        self.prior_prev = 0.
        self.end_of_ep_prev = False


        # Initialize safety filter (barrier function)
        a_max = 0.44  # Saturation accounting for hover                           
        a_min = -0.56
        self.cbf = Barrier(state_dim, action_dim, a_max, a_min)
        
        self.n_ep = n_ep
        self.save_ep_reward = np.empty(n_ep)
        self.z_RL = 0.
        self.zdot_RL = 0.
        self.T_z = 0.
        self.prior = 0.
        self.t_last_rl_msg = rospy.Time.now()
        self.t_last_msg_p = rospy.Time.now()
        self.t_last_msg_v = rospy.Time.now()
        self.cur_reward = 0.
        self.land_threshold = 0.075
        self.end_of_ep = False
        self.safety_intervention = 0.
        self.RL_received = False

        # Initialize arrays to save episodic state and control values
        self.ep_length = ep_length
        self.z_ep = np.empty((self.n_ep, self.ep_length))
        self.zdot_ep = np.empty((self.n_ep, self.ep_length))
        self.T_ep = np.empty((self.n_ep, self.ep_length))
        self.rl_buffer = ReplayBuffer(1000000)  # ((z1, zdot1), T, r, (z2, zdot2))


    def output_command(self, s, a_prior):
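        # Blend the exploratory RL action with the control prior:
        # a_mix = a_RL/(1 + l_mix) + l_mix*a_prior/(1 + l_mix)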
        a = self.learner.actor.predict(s) + self.action_bound*self.learner.actor_noise()
        #return a + a_prior                                                                                   
        return a/(1+self.l_mix) + self.l_mix*a_prior/(1+self.l_mix)

    def output_command_noNoise(self, s, a_prior):
        a = self.learner.actor.predict(s)
        return a/(1+self.l_mix) + self.l_mix*a_prior/(1+self.l_mix)

    def safety_filter(self, s, a):
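        # Query the discrete-time dynamics model and let the control barrier
        # function compute a correction u_bar; a + u_bar is the filtered command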
        # f = np.zeros(2)
        # g = np.zeros(2)
        [f, g, x] = self.get_dynamics(s)
        u_bar = self.cbf.control_barrier(a, f, g, x)
        return a + u_bar, u_bar

    def get_dynamics(self, s):
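        # Discrete-time double-integrator model of the vertical axis,
        # s = (z, zdot): next state is approximately f(s) + g*u per sample period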
        T = 1./60  # Sampling period (s)
        G = 0.   # Gravity
        m = 0.56/9.81   # Scaled mass
        s = np.squeeze(s)
        f = np.array([s[0] + s[1]*T, s[1] - G*T])
        g = np.array([T**2/(2*m), T/m])
        return [f, g, s]

    def train_rl(self):
        # Train based on replay buffer
        minibatch_size = self.minibatch_size
        if self.rl_buffer_high.count > minibatch_size/8 and self.rl_buffer_mid.count > minibatch_size/4 and self.rl_buffer_low.count > 5*minibatch_size/8:
            self.learner.train(self.rl_buffer_low, self.rl_buffer_mid, self.rl_buffer_high, minibatch_size) #Train

    def training(self):
        for ep in range(self.n_ep):
            self.counter = ep
            self.isTraining = False
            print("Resetting position...")
            self.reset_position()
            self.run_episode()

    def run_episode(self):
        cum_reward = 0.
        self.end_of_ep = False

        print("Running episode...")
        for t in range(self.ep_length):
            if self.isTraining:
                cum_reward += self.cur_reward
            self.rate.sleep()
            self.isTraining = True
        
        # Publish final message with end_of_ep flag set to true
        self.end_of_ep = True
        self.rate.sleep()
        
        print("Episode " + str(self.counter) + " reward: ", cum_reward)

    def pd_attitude_ctrl(self):
        #Make sure state is consistent with what is sent to RL:
        p = self.p
        v = self.v
        p.z = self.z_RL
        v.z = self.zdot_RL
        self.T_d, q_d, _ = self.rl_pos_controller.get_ctrl(p=p, q=self.q, v=v, omg=self.omg,
                                                           p_d=self.p_final, v_d=self.v_d, T_RL=self.T_z)
        self.q_d.x, self.q_d.y, self.q_d.z, self.q_d.w = q_d

    def calc_control_prior(self):
        #Make sure state is consistent with what is sent to RL:
        p = self.p
        v = self.v
        p.z = self.z_RL
        v.z = self.zdot_RL
        self.prior = self.rl_pos_controller.get_prior(p=p, q=self.q, v=v, omg=self.omg,
                                                   p_d=self.p_final, v_d=self.v_d)

    def calc_reward(self):
        reward_type = 8  # Selects which reward formulation below is used

        if reward_type == 1:
            if self.z_RL < self.land_threshold:
                self.cur_reward = -self.z_RL + 1.0*self.zdot_RL # Penalize negative velocity
            else:
                self.cur_reward = -self.z_RL
        elif reward_type == 2:
            if self.z_RL < self.land_threshold:
                self.cur_reward = -1. + 1.0*self.zdot_RL # Penalize negative velocity
            else:
                self.cur_reward = -1.
        elif reward_type == 3:
            self.cur_reward = -self.z_RL + 1.0*self.zdot_RL/self.z_RL # Penalize negative velocity
        elif reward_type == 4:
            d = 2.
            self.cur_reward = max(-d, -d*abs(self.zdot_RL)/self.z_RL - d*self.z_RL) # Penalize negative velocity
        elif reward_type == 5:
            d = 0.5
            self.cur_reward = min(-10*d*self.z_RL, -d*abs(self.zdot_RL)/self.z_RL - 5*d*self.z_RL) # Penalize negative velocity
        elif reward_type == 6:
            d = 20
            h_alt = self.p_final.z #Hover altitude
            v_c = 0.5 #Max landing velocity
            self.cur_reward = min(-1 -abs(abs(self.z_RL) - h_alt) - self.safety_intervention, -1 -abs(abs(self.z_RL) - h_alt)
                                  -self.safety_intervention -0.2*(abs(self.zdot_RL) - v_c)/(self.z_RL**2+0.1))
        elif reward_type == 7:
            T = 20 # Time penalty
            d = 20 # Scaling factor
            h_alt = self.p_final.z #Hover altitude
            v_c = 0.5 #Max landing velocity

            if abs(self.z_RL-h_alt) <= 0.01: # Stop penalizing time if close enough to hover altitude
                T = 0.

            self.cur_reward = min(-T -d*abs(abs(self.z_RL) - h_alt), -T -d*abs(abs(self.z_RL) - h_alt) -(abs(self.zdot_RL) - v_c)/(self.z_RL**2+0.1))

        elif reward_type == 8:
            z_target = self.p_final.z
            if (self.z_RL < z_target+0.2 and self.z_RL >= z_target-0.04):
                self.cur_reward = -abs(self.z_RL-z_target) #- 3*abs(self.safety_intervention)
            elif (self.z_RL < z_target-0.04):
                self.cur_reward = -2*abs(self.z_RL-z_target) + min(self.zdot_RL, 0.)#- 3*abs(self.safety_intervention)
            else:
                self.cur_reward = -0.2 #- 3*abs(self.safety_intervention)


    def reset_position(self):
        # Arm the drone
        mavros.set_namespace()
        command.arming(True)

        rospy.wait_for_service('mavros/set_mode')
        change_mode = rospy.ServiceProxy('mavros/set_mode', SetMode)

        self.waypoint = PoseStamped()
        self.waypoint.header.frame_id = 'map'
        self.waypoint.pose.position.x = self.p_init.x
        self.waypoint.pose.position.y = self.p_init.y
        self.waypoint.pose.position.z = self.p_init.z

        while not rospy.is_shutdown() and (np.linalg.norm(
                np.array([self.p_init.x - self.p.x,
                          self.p_init.y - self.p.y])) > 0.2 or abs(self.p_init.z - self.p.z) > 0.05): # or \
                          #rospy.Duration.from_sec(rospy.get_time() - self.t_last_rl_msg.to_sec()).to_sec() > 0.05):
            result_mode = change_mode(0, "OFFBOARD")
            self.pub_posmsg.publish(self.waypoint)
            self.rate.sleep()

    def init_ROS(self):
        self.pub_attmsg = rospy.Publisher('/mavros/setpoint_raw/attitude', AttitudeTarget, queue_size=10)
        self.pub_posmsg = rospy.Publisher('/mavros/setpoint_position/local', PoseStamped, queue_size=10)

        rospy.init_node('controller_bintel', anonymous=True)

        self.rate = rospy.Rate(self.main_loop_rate)

        # - Subscribe to local position
        self.local_pose = PoseStamped()
        self.velocity = TwistStamped()
        self.rc_out = RCOut()
        pos_sub = message_filters.Subscriber('/mavros/local_position/pose', PoseStamped)
        #rospy.Subscriber('/mavros/local_position/pose', PoseStamped, self._read_position)
        if self.is_simulation:
            #rospy.Subscriber('/mavros/local_position/velocity_body', TwistStamped, self._read_velocity)
            vel_sub = message_filters.Subscriber('/mavros/local_position/velocity_body', TwistStamped)
        else:
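            # Note: this branch currently subscribes to the same topic as the
            # simulation branch; the commented-out line below used
            # '/mavros/local_position/velocity'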
            #rospy.Subscriber('/mavros/local_position/velocity', TwistStamped, self._read_velocity)
            vel_sub = message_filters.Subscriber('/mavros/local_position/velocity_body', TwistStamped)

        ts = message_filters.TimeSynchronizer([pos_sub, vel_sub], 1)
        ts.registerCallback(self._read_pos_vel)
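        # Note: TimeSynchronizer only pairs messages with identical header stamps;
        # message_filters.ApproximateTimeSynchronizer is the usual alternative when
        # pose and velocity stamps are not exactly aligned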


    def _read_position(self, data):
        self.p.x, self.p.y, self.p.z = data.pose.position.x, data.pose.position.y, data.pose.position.z
        self.q.w, self.q.x, self.q.y, self.q.z = data.pose.orientation.w, data.pose.orientation.x, \
                                                 data.pose.orientation.y, data.pose.orientation.z
        self.t_last_msg_p = rospy.Time(secs=int(data.header.stamp.secs), nsecs=data.header.stamp.nsecs)

    def _read_velocity(self, data):
        self.v.x, self.v.y, self.v.z = data.twist.linear.x, data.twist.linear.y, data.twist.linear.z
        self.omg.x, self.omg.y, self.omg.z = data.twist.angular.x, data.twist.angular.y, data.twist.angular.z
        self.t_last_msg_v = rospy.Time(secs=int(data.header.stamp.secs), nsecs=data.header.stamp.nsecs)

    def _read_pos_vel(self, pos_data, vel_data):
        self.p.x, self.p.y, self.p.z = pos_data.pose.position.x, pos_data.pose.position.y, pos_data.pose.position.z
        self.q.w, self.q.x, self.q.y, self.q.z = pos_data.pose.orientation.w, pos_data.pose.orientation.x, \
                                                 pos_data.pose.orientation.y, pos_data.pose.orientation.z
        self.t_last_msg_p = rospy.Time(secs=int(pos_data.header.stamp.secs), nsecs=pos_data.header.stamp.nsecs)

        self.v.x, self.v.y, self.v.z = vel_data.twist.linear.x, vel_data.twist.linear.y, vel_data.twist.linear.z
        self.omg.x, self.omg.y, self.omg.z = vel_data.twist.angular.x, vel_data.twist.angular.y, vel_data.twist.angular.z
        self.t_last_msg_v = rospy.Time(secs=int(vel_data.header.stamp.secs), nsecs=vel_data.header.stamp.nsecs)

        # Get state pair, control prior and reward, send to RL
        if (self.isTraining):
            self.z_RL = self.p.z
            self.zdot_RL = self.v.z
            self.calc_control_prior()
            self.calc_reward()
            s = [self.z_RL, self.zdot_RL]
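            # For the first 30 episodes the actor output includes exploration noise;
            # afterwards the deterministic policy is used. In both cases the blended
            # command is passed through the CBF safety filter before being deployed.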
            if (self.counter < 30):
                a_rl = np.squeeze(self.output_command(np.array([[self.z_RL, self.zdot_RL]]), self.prior))
                self.T_z, _ = np.squeeze(self.safety_filter(s, a_rl))
            else:
                a_rl = np.squeeze(self.output_command_noNoise(np.array([[self.z_RL, self.zdot_RL]]), self.prior))
                self.T_z, _ = np.squeeze(self.safety_filter(s, a_rl))
            self.create_rl_train_message(stamp=rospy.Time.now())

            # Calculate control action using command from RL
            self.pd_attitude_ctrl()
            self.create_attitude_msg(stamp=rospy.Time.now())
            self.pub_attmsg.publish(self.msg)

    def create_attitude_msg(self, stamp):
        ## Set the header
        self.msg.header.stamp = stamp
        self.msg.header.frame_id = 'map'

        ## Set message content
        self.msg.orientation = Quaternion(x=self.q_d.x, y=self.q_d.y, z=self.q_d.z, w=self.q_d.w)
        self.msg.body_rate = Vector3(x=self.omg_d.x, y=self.omg_d.y, z=self.omg_d.z)
        self.msg.thrust = self.T_d

    def create_rl_train_message(self, stamp):

        # Add to replay buffer: (z_RL_prev, zdot_RL_prev) is the state from the
        # previous control step
        s = np.array([self.z_RL_prev, self.zdot_RL_prev])

        # Define control action                                                                                   
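        # Recover the pre-mixing RL action from the deployed thrust by inverting the
        # blend used in output_command: a = (1 + l_mix)*T_prev - l_mix*prior
        # (the deployed thrust also includes any CBF correction)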
        a_s = self.T_z_prev - self.l_mix*np.squeeze(self.prior_prev)/(1+self.l_mix)
        a = (self.l_mix + 1)*a_s

        s2 = np.array([self.z_RL, self.zdot_RL])
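        # Store the transition only if the altitude jump is small (< 0.2 m) and the
        # velocity change agrees in sign with the applied thrust (apparently a coarse
        # filter for inconsistent samples), then route it to an altitude-region buffer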
        if abs(self.z_RL - self.z_RL_prev) < 0.2 and (s2[1] - s[1])*np.squeeze(self.T_z_prev) > 0:
            if (self.z_RL <= 0.2):
                self.rl_buffer_low.add(s, a, self.cur_reward_prev, self.end_of_ep, s2)
            elif (self.z_RL > 0.2 and self.z_RL < 0.8):
                self.rl_buffer_mid.add(s, a, self.cur_reward_prev, self.end_of_ep, s2)
            else:
                self.rl_buffer_high.add(s, a, self.cur_reward_prev, self.end_of_ep, s2)

        self.z_RL_prev = self.z_RL
        self.zdot_RL_prev = self.zdot_RL
        self.cur_reward_prev = self.cur_reward
        self.T_z_prev = self.T_z
        self.prior_prev = self.prior
        self.end_of_ep_prev = self.end_of_ep
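
# A minimal usage sketch (illustrative values, not from the original listing); it
# assumes a running ROS/MAVROS stack and that the project's Learner, ReplayBuffer,
# Barrier, RLPosController and StateReward definitions are importable.
if __name__ == '__main__':
    try:
        lander = RL_lander(n_ep=100, ep_length=600)
        lander.training()
    except rospy.ROSInterruptException:
        pass
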
Example #2
class RL_Controller():
    """
    """
    def __init__(self):
        self.is_simulation = True

        # Create Replay Buffer to track explored points
        self.rl_buffer = ReplayBuffer(
            100000)  # ((z1, zdot1), T, r, (z2, zdot2))

        # Initialize ROS
        self.main_loop_rate = 60
        self.init_ROS()
        self.rl_command_msg = AttitudeTarget()

        # Initialize learner
        sess = tf.Session()
        state_dim, action_dim = 2, 1
        action_bound = 1.0
        actor_lr, critic_lr = 0.0001, 0.0005
        gamma, tau = 0.995, 0.0002
        self.minibatch_size = 256
        self.learner = Learner(sess, 0., state_dim, action_dim, action_bound,
                               actor_lr, critic_lr, tau, gamma,
                               self.minibatch_size)
        self.l_mix = 10.

        #Initialize safety filter (barrier function)
        a_max = 1.
        a_min = -1.
        self.cbf = Barrier(state_dim, action_dim, a_max, a_min)

        # Initialize variables storing current data from environment
        self.z = 0.
        self.zdot = 0.
        self.act = 0.
        self.reward = 0.
        self.prev_thrust = 0.
        self.a_prior = 0.

        # Initialize experiment variables
        self.n_ep = 1000

    def run_experiment(self):
        last_ep = 0
        training_interval = 60
        #if (self.l_mix > 3):
        #    self.l_mix = self.l_mix/1.05
        for ep in range(self.n_ep):  #TODO: Define number of iterations
            self.create_rl_command_msg(
                stamp=rospy.Time.now())  # Synchronize to state read
            self.pub.publish(self.rl_command_msg)
            if ep - last_ep == training_interval:
                last_ep = ep
                self.train_rl()

            self.rate.sleep()

    def output_command(self, s, a_prior):
        a = self.learner.actor.predict(s) + self.learner.actor_noise()
        return a / (1 + self.l_mix) + self.l_mix * a_prior / (1 + self.l_mix)

    def safety_filter(self, s, a):
        # f = np.zeros(2)
        # g = np.zeros(2)
        [f, g, x] = self.get_dynamics(s)
        u_bar = self.cbf.control_barrier(a, f, g, x)
        return a + u_bar, u_bar

    def get_dynamics(self, s):
        T = 1. / 60  # Sampling period (s)
        G = 0.  # Gravity
        m = 0.56 / 9.81  # Scaled mass
        s = np.squeeze(s)
        f = np.array([s[0] + s[1] * T, s[1] - G * T])
        g = np.array([T**2 / (2 * m), T / m])
        return [f, g, s]

    def train_rl(self):
        # Train based on replay buffer
        minibatch_size = self.minibatch_size
        if self.rl_buffer.count > 2 * minibatch_size:
            self.learner.train(self.rl_buffer, minibatch_size)  #Train

    """
    Define ROS Interface
    """

    def init_ROS(self):
        # Publish RL thrust commands on the 'rl/commands' topic
        self.pub = rospy.Publisher('rl/commands', AttitudeTarget, queue_size=2)

        rospy.init_node('ddpg_bintel', anonymous=True)
        self.rate = rospy.Rate(self.main_loop_rate)

        # - Subscribe to state (z, zdot), reward and deployed thrust messages
        rospy.Subscriber('rl/training', StateReward, self._read_environment)

    """
    Callback function to read state and output torque command
    """

    def _read_environment(self, data):
        z = data.pose.position.z
        zdot = data.velocity.linear.z
        cur_reward = data.reward.data
        dep_thrust = data.thrust.data
        t_last_msg = rospy.Time(secs=int(data.header.stamp.secs),
                                nsecs=data.header.stamp.nsecs)
        t = data.end_of_ep.data

        self.a_prior = data.prior.data

        # Add to replay buffer. At this point self.z, self.zdot, self.act and
        # self.reward still hold the previous step's values (they are updated at
        # the end of this callback), so the stored transition is
        # (s_prev, a_prev, r_prev, done, s_now).
        s = np.array([self.z, self.zdot])

        # Recover the pre-mixing RL action from the deployed thrust by inverting
        # the blend used in output_command
        a_s = np.squeeze(dep_thrust) - self.l_mix * np.squeeze(
            self.a_prior) / (1 + self.l_mix)
        a = (self.l_mix + 1) * a_s

        s2 = np.array([z, zdot])
        #self.rl_buffer.add(s, self.prev_thrust, cur_reward, t, s2)
        #self.rl_buffer.add(s, a, cur_reward, t, s2)
        self.rl_buffer.add(s, self.act, self.reward, t, s2)
        #self.create_rl_command_msg(rospy.Time.now()) #TODO: Test if creating messages here allows "parallelization"
        #self.pub.publish(self.rl_command_msg)
        self.z = z
        self.zdot = zdot
        self.act = a
        self.reward = cur_reward
        self.prev_thrust = dep_thrust

    def create_rl_command_msg(self, stamp):
        ## Set the header
        self.rl_command_msg.header.stamp = stamp
        self.rl_command_msg.header.frame_id = '/world'

        ## Set message content
        s = np.array([[self.z, self.zdot]])
        #a_rl = self.output_command(s, self.a_prior)
        ## Use cbf to filter output
        self.rl_command_msg.body_rate.z = 0.
        self.rl_command_msg.thrust = self.output_command(s, self.a_prior)
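
# A minimal usage sketch (illustrative, not from the original listing); it assumes a
# running ROS master and a companion node publishing StateReward messages on
# 'rl/training'.
if __name__ == '__main__':
    try:
        controller = RL_Controller()
        controller.run_experiment()
    except rospy.ROSInterruptException:
        pass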