def build_actor(self):
    # Generate the actor's policy neural network
    agent_name = 'agent_' + str(self.n_agent)  # agent name, e.g. 'agent_3'
    self.state_placeholder = tf.placeholder(dtype=tf.float32,
                                            shape=[None, Settings.OBSERVATION_SIZE],
                                            name='state_placeholder')  # batch dimension left flexible; OBSERVATION_SIZE may describe higher-dimensional (e.g., pixel) observations

    #############################
    #### Generate this Actor ####
    #############################
    self.policy = BuildActorNetwork(self.state_placeholder, scope=agent_name)
class Learner:
    def __init__(self, sess, saver, replay_buffer, writer):
        print("Initialising learner...")

        # Saving items to the self. object for future use
        self.sess = sess
        self.saver = saver
        self.replay_buffer = replay_buffer
        self.writer = writer

        with tf.variable_scope("Preparing_placeholders"):
            # Defining placeholders for training
            self.state_placeholder = tf.placeholder(
                dtype=tf.float32,
                shape=[None, Settings.OBSERVATION_SIZE],
                name="state_placeholder")  # OBSERVATION_SIZE may describe higher-dimensional (e.g., pixel) observations
            self.action_placeholder = tf.placeholder(
                dtype=tf.float32,
                shape=[None, Settings.ACTION_SIZE],
                name="action_placeholder")  # placeholder for actions
            self.target_bins_placeholder = tf.placeholder(
                dtype=tf.float32,
                shape=[None, Settings.NUMBER_OF_BINS],
                name="target_bins_placeholder")  # bin values of the target network with the Bellman update applied
            self.target_q_distribution_placeholder = tf.placeholder(
                dtype=tf.float32,
                shape=[None, Settings.NUMBER_OF_BINS],
                name="target_q_distribution_placeholder")  # future q-distribution from the target critic
            self.dQ_dAction_placeholder = tf.placeholder(
                dtype=tf.float32,
                shape=[Settings.MINI_BATCH_SIZE, Settings.ACTION_SIZE],
                name="dQ_dAction_placeholder")  # gradient of the critic's predicted value with respect to the input actions
            self.importance_sampling_weights_placeholder = tf.placeholder(
                dtype=tf.float32,
                shape=None,
                name="importance_sampling_weights_placeholder")  # [PRIORITY_REPLAY_BUFFER only] weights used to remove the bias introduced by priority sampling

        # The support (bin values) over which the distributional critic predicts the likelihood of each return
        self.bins = np.linspace(Settings.MIN_V,
                                Settings.MAX_V,
                                Settings.NUMBER_OF_BINS,
                                dtype=np.float32)

        ######################################################
        ##### Build the networks and training operations #####
        ######################################################
        self.build_main_networks()
        self.build_target_networks()

        # Build the operation to update the target network parameters
        self.build_target_parameter_update_operations()

        # Create operations for TensorBoard logging
        self.create_summary_functions()

        print("Learner created!")
    def create_summary_functions(self):
        # Creates the operation that, when run, will log the appropriate data to TensorBoard
        with tf.variable_scope("Logging_Learning"):
            # The critic loss during training is the only logged item
            self.iteration_loss_placeholder = tf.placeholder(tf.float32)
            self.iteration_loss_summary = tf.summary.scalar("Loss", self.iteration_loss_placeholder)
            self.iteration_summary = tf.summary.merge([self.iteration_loss_summary])

    def build_main_networks(self):
        ##################################
        #### Build the learned critic ####
        ##################################
        self.critic = BuildQNetwork(self.state_placeholder,
                                    self.action_placeholder,
                                    scope='learner_critic_main')

        # Build the critic training function
        self.train_critic_one_step, self.projected_target_distribution = self.critic.generate_training_function(
            self.target_q_distribution_placeholder,
            self.target_bins_placeholder,
            self.importance_sampling_weights_placeholder)

        #################################
        #### Build the learned actor ####
        #################################
        self.actor = BuildActorNetwork(self.state_placeholder, scope='learner_actor_main')

        # Build the actor training function
        self.train_actor_one_step = self.actor.generate_training_function(self.dQ_dAction_placeholder)

    def build_target_networks(self):
        ###########################################
        #### Build the target actor and critic ####
        ###########################################
        self.target_critic = BuildQNetwork(self.state_placeholder,
                                           self.action_placeholder,
                                           scope='learner_critic_target')
        self.target_actor = BuildActorNetwork(self.state_placeholder, scope='learner_actor_target')

    def build_target_parameter_update_operations(self):
        # Build operations that either
        #   1) initialize the target networks to be identical to the main networks, or
        #   2) slowly copy the main network parameters to the target networks according to Settings.TARGET_NETWORK_TAU
        main_parameters = self.actor.parameters + self.critic.parameters
        target_parameters = self.target_actor.parameters + self.target_critic.parameters

        # Build the operation that fully copies the main network parameters to the targets [Option 1 above]
        initialize_target_network_parameters = []
        # Looping across all variables in the main critic and main actor
        for source_variable, destination_variable in zip(main_parameters, target_parameters):
            initialize_target_network_parameters.append(destination_variable.assign(source_variable))

        # Build the operation that slowly updates the target networks according to Settings.TARGET_NETWORK_TAU [Option 2 above]
        update_target_network_parameters = []
        # Looping across all variables in the main critic and main actor
        for source_variable, destination_variable in zip(main_parameters, target_parameters):
            # target = tau*main + (1 - tau)*target
            update_target_network_parameters.append(
                destination_variable.assign(
                    tf.multiply(source_variable, Settings.TARGET_NETWORK_TAU) +
                    tf.multiply(destination_variable, 1. - Settings.TARGET_NETWORK_TAU)))

        # Save both operations to the self object for later use
        self.initialize_target_network_parameters = initialize_target_network_parameters
        self.update_target_network_parameters = update_target_network_parameters
    def generate_queue(self):
        # Generate the queues responsible for communicating with the learner
        self.agent_to_learner = multiprocessing.Queue(maxsize=1)
        self.learner_to_agent = multiprocessing.Queue(maxsize=1)
        return self.agent_to_learner, self.learner_to_agent

    def run(self, stop_run_flag, replay_buffer_dump_flag, starting_training_iteration):
        # Continuously train the actor and the critic by applying stochastic gradient
        # descent to batches of data sampled from the replay buffer
        print("Starting to run learner at iteration %i" % starting_training_iteration)

        # Initializing the counter of training iterations
        self.total_training_iterations = starting_training_iteration

        # Starting time
        start_time = time.time()

        # Initialize the target networks to be identical to the main networks
        self.sess.run(self.initialize_target_network_parameters)

        # Set up priority replay buffer parameters, if used
        if Settings.PRIORITY_REPLAY_BUFFER:
            # When priority_beta = 1, the priority sampling bias is fully accounted for.
            # We slowly anneal priority_beta towards 1.0 over the course of training.
            # A lower beta allows the prioritized samples to be weighted unfairly,
            # but this can help training, at least initially.
            priority_beta = Settings.PRIORITY_BETA_START  # starting beta value
            beta_increment = (Settings.PRIORITY_BETA_END - Settings.PRIORITY_BETA_START) / Settings.MAX_TRAINING_ITERATIONS  # increment applied each iteration
        else:
            # If we aren't using a priority buffer, set the importance-sampling weights to ones for the entire run
            weights_batch = np.ones(shape=Settings.MINI_BATCH_SIZE)

        ###############################
        ##### Start Training Loop #####
        ###############################
        while self.total_training_iterations < Settings.MAX_TRAINING_ITERATIONS and not stop_run_flag.is_set():

            # Check if the agent wants some q-distributions calculated
            try:
                state_log, action_log, next_state_log, reward_log, done_log, gamma_log = self.agent_to_learner.get(False)

                # Reshaping
                gamma_log = np.reshape(gamma_log, [-1, 1])

                # Get the online q-distribution
                critic_distribution = self.sess.run(
                    self.critic.q_distribution,
                    feed_dict={self.state_placeholder: state_log,
                               self.action_placeholder: action_log})  # [episode length, number of bins]

                # Clean next actions from the target actor
                clean_next_actions = self.sess.run(
                    self.target_actor.action_scaled,
                    {self.state_placeholder: next_state_log})  # [episode length, num_actions]

                # Get the target q-distribution
                target_critic_distribution = self.sess.run(
                    self.target_critic.q_distribution,
                    feed_dict={self.state_placeholder: state_log,
                               self.action_placeholder: clean_next_actions})  # [episode length, number of bins]

                # Create a batch of bins [see further description below]
                target_bins = np.repeat(np.expand_dims(self.bins, axis=0),
                                        len(reward_log),
                                        axis=0)  # [episode length, number_of_bins]
                target_bins[done_log, :] = 0.0
                target_bins = np.expand_dims(reward_log, axis=1) + (target_bins * gamma_log)

                # Calculating the Bellman distribution (r + gamma*target_q_distribution).
                # The critic loss is taken with respect to this projection.
                projected_target_distribution = self.sess.run(
                    self.projected_target_distribution,
                    feed_dict={self.target_q_distribution_placeholder: target_critic_distribution,
                               self.target_bins_placeholder: target_bins})

                # Calculating the loss at each timestep (importance-sampling weights are all ones here)
                episode_weights = np.ones(shape=len(reward_log))
                loss_log = self.sess.run(
                    self.critic.loss,
                    feed_dict={self.state_placeholder: state_log,
                               self.action_placeholder: action_log,
                               self.target_q_distribution_placeholder: target_critic_distribution,
                               self.target_bins_placeholder: target_bins,
                               self.importance_sampling_weights_placeholder: episode_weights})

                # Send the results back to the agent
                self.learner_to_agent.put((critic_distribution, target_critic_distribution, projected_target_distribution, loss_log))

            except queue.Empty:
                # If the queue was empty, do nothing
                pass

            # If we don't have enough data yet to train OR we want to wait before we start to train
            if (self.replay_buffer.how_filled() < Settings.MINI_BATCH_SIZE) or \
               (self.replay_buffer.how_filled() < Settings.REPLAY_BUFFER_START_TRAINING_FULLNESS):
                continue  # Skip this training iteration. Wait for more training data.

            # Sample a mini-batch of data from the replay buffer
            if Settings.PRIORITY_REPLAY_BUFFER:
                sampled_batch = self.replay_buffer.sample(priority_beta)
                weights_batch = sampled_batch[6]  # [priority-only data] used for removing bias in prioritized data
                index_batch = sampled_batch[7]    # [priority-only data] used for updating priorities
            else:
                sampled_batch = self.replay_buffer.sample()

            # Unpack the training data
            states_batch = sampled_batch[0]
            actions_batch = sampled_batch[1]
            rewards_batch = sampled_batch[2]
            next_states_batch = sampled_batch[3]
            dones_batch = sampled_batch[4]
            gammas_batch = sampled_batch[5]

            ###################################
            ##### Prepare Critic Training #####
            ###################################
            # Get clean next actions by feeding the next states through the target actor
            clean_next_actions = self.sess.run(
                self.target_actor.action_scaled,
                {self.state_placeholder: next_states_batch})  # [batch_size, num_actions]

            # Get the next q-distribution by passing the next states and clean next actions through the target critic
            target_critic_distribution = self.sess.run(
                self.target_critic.q_distribution,
                {self.state_placeholder: next_states_batch,
                 self.action_placeholder: clean_next_actions})  # [batch_size, number_of_bins]

            # Create a batch of bins
            target_bins = np.repeat(np.expand_dims(self.bins, axis=0),
                                    Settings.MINI_BATCH_SIZE,
                                    axis=0)  # [batch_size, number_of_bins]

            # If this data in the batch corresponds to the end of an episode (dones_batch[i] = True),
            # set all the bins to 0.0. This eliminates the predicted future reward from the Bellman
            # update (i.e., the target is only the current reward, since the episode does not continue).
            target_bins[dones_batch, :] = 0.0

            # Bellman projection: reward + gamma^N * bin -> the new expected return, according to the
            # recently-received reward. If the new return falls outside of a bin, the probability
            # assigned to that bin is adjusted by the projection.
            target_bins = np.expand_dims(rewards_batch, axis=1) + (target_bins * gammas_batch)

            #####################################
            ##### TRAIN THE CRITIC ONE STEP #####
            #####################################
            critic_loss, _ = self.sess.run(
                [self.critic.loss, self.train_critic_one_step],
                {self.state_placeholder: states_batch,
                 self.action_placeholder: actions_batch,
                 self.target_q_distribution_placeholder: target_critic_distribution,
                 self.target_bins_placeholder: target_bins,
                 self.importance_sampling_weights_placeholder: weights_batch})

            ##################################
            ##### Prepare Actor Training #####
            ##################################
            # Get the clean actions that the main actor would have taken for this batch of states if no noise were added
            clean_actions = self.sess.run(self.actor.action_scaled,
                                          {self.state_placeholder: states_batch})

            # Calculate the derivative of the main critic's q-value with respect to these actions
            dQ_dAction = self.sess.run(self.critic.dQ_dAction,
                                       {self.state_placeholder: states_batch,
                                        self.action_placeholder: clean_actions})  # also known as action gradients

            ####################################
            ##### TRAIN THE ACTOR ONE STEP #####
            ####################################
            self.sess.run(self.train_actor_one_step,
                          {self.state_placeholder: states_batch,
                           self.dQ_dAction_placeholder: dQ_dAction[0]})

            # If it's time to update the target networks
            if self.total_training_iterations % Settings.UPDATE_TARGET_NETWORKS_EVERY_NUM_ITERATIONS == 0:
                # Update the target networks according to TAU
                self.sess.run(self.update_target_network_parameters)

            # If we're using a priority buffer, tend to it now
            if Settings.PRIORITY_REPLAY_BUFFER:
                # The priority replay buffer ranks data according to how unexpected they were.
                # An unexpected data point will have a high loss. Now that we've just calculated the loss,
                # update the priorities in the replay buffer.
                self.replay_buffer.update_priorities(index_batch, (np.abs(critic_loss) + Settings.PRIORITY_EPSILON))

                # Increment the priority beta value slightly closer towards 1.0
                priority_beta += beta_increment

            # If it's time to check whether the prioritized replay buffer is overfull
            if Settings.PRIORITY_REPLAY_BUFFER and (self.total_training_iterations % Settings.DUMP_PRIORITY_REPLAY_BUFFER_EVER_NUM_ITERATIONS == 0):
                # If the buffer is overfilled
                if self.replay_buffer.how_filled() > Settings.REPLAY_BUFFER_SIZE:
                    # Make the agents wait before adding any more data to the buffer
                    replay_buffer_dump_flag.clear()
                    # How overfull is the buffer?
                    samples_to_remove = self.replay_buffer.how_filled() - Settings.REPLAY_BUFFER_SIZE
                    # Remove the appropriate number of samples
                    self.replay_buffer.remove(samples_to_remove)
                    # Allow the agents to continue now that the buffer is ready
                    replay_buffer_dump_flag.set()

            # If it's time to log the training performance to TensorBoard
            if self.total_training_iterations % Settings.LOG_TRAINING_PERFORMANCE_EVERY_NUM_ITERATIONS == 0:
                # Logging the mean critic loss across the batch
                summary = self.sess.run(self.iteration_summary,
                                        feed_dict={self.iteration_loss_placeholder: np.mean(critic_loss)})
                self.writer.add_summary(summary, self.total_training_iterations)
            # If it's time to save a checkpoint, be it a regular checkpoint, the final planned iteration, or the final unplanned iteration
            if (self.total_training_iterations % Settings.SAVE_CHECKPOINT_EVERY_NUM_ITERATIONS == 0) or \
               (self.total_training_iterations == Settings.MAX_TRAINING_ITERATIONS) or stop_run_flag.is_set():
                # Save the state of all networks and note the training iteration
                self.saver.save(self.total_training_iterations, self.state_placeholder, self.actor.action_scaled)

                # Make the agents wait before adding any more data to the buffer
                replay_buffer_dump_flag.clear()
                # Save the replay buffer
                self.replay_buffer.save()
                # Allow the agents to continue now that the buffer is ready
                replay_buffer_dump_flag.set()

            # If it's time to print the training performance to the screen
            if self.total_training_iterations % Settings.DISPLAY_TRAINING_PERFORMANCE_EVERY_NUM_ITERATIONS == 0:
                print("Trained actor and critic %i iterations in %.2f minutes, at %.3f s/iteration. Now at iteration %i."
                      % (Settings.DISPLAY_TRAINING_PERFORMANCE_EVERY_NUM_ITERATIONS,
                         (time.time() - start_time) / 60,
                         (time.time() - start_time) / Settings.DISPLAY_TRAINING_PERFORMANCE_EVERY_NUM_ITERATIONS,
                         self.total_training_iterations))
                start_time = time.time()  # resetting the timer for the next DISPLAY_TRAINING_PERFORMANCE_EVERY_NUM_ITERATIONS iterations

            # Incrementing the training iteration counter
            self.total_training_iterations += 1

        # If we are done training
        print("Learner finished after running " + str(self.total_training_iterations) + " training iterations!")

        # Flip the flag signalling all agents to stop
        stop_run_flag.set()
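# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original learner): a NumPy version of
# the categorical projection that the graph op `self.projected_target_distribution`
# is assumed to perform inside BuildQNetwork.generate_training_function(). The
# learner above builds `target_bins = r + gamma * bins` (zeroed at episode ends)
# and the projection redistributes the target probabilities back onto the fixed
# support so the cross-entropy loss can be taken. Function and variable names
# here are illustrative only.
import numpy as np

def project_onto_support(target_bins, target_probabilities, support):
    """Project probability mass located at `target_bins` onto the fixed `support`."""
    v_min, v_max = support[0], support[-1]
    delta_z = support[1] - support[0]
    batch_size, num_bins = target_probabilities.shape
    projected = np.zeros_like(target_probabilities)

    # Clip the shifted atoms to the support and find their fractional indices
    b = (np.clip(target_bins, v_min, v_max) - v_min) / delta_z
    lower = np.floor(b).astype(int)
    upper = np.ceil(b).astype(int)

    for i in range(batch_size):
        for j in range(num_bins):
            if lower[i, j] == upper[i, j]:
                # The atom lands exactly on a support point
                projected[i, lower[i, j]] += target_probabilities[i, j]
            else:
                # Split the mass between the two neighbouring support points
                projected[i, lower[i, j]] += target_probabilities[i, j] * (upper[i, j] - b[i, j])
                projected[i, upper[i, j]] += target_probabilities[i, j] * (b[i, j] - lower[i, j])
    return projected

# Example (dummy numbers): probability mass is preserved by the projection.
# support = np.linspace(-10.0, 10.0, 51, dtype=np.float32)   # analogous to self.bins
# probs = np.full((1, 51), 1.0 / 51, dtype=np.float32)       # a uniform target distribution
# shifted = 1.0 + 0.99 * support[np.newaxis, :]              # r + gamma * bins, as built in run()
# project_onto_support(shifted, probs, support).sum()        # -> 1.0
# ---------------------------------------------------------------------------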
def main():
    import argparse
    parser = argparse.ArgumentParser(description="Guided mode example")
    parser.add_argument("-ti", "--target_id", dest='target_id', default=0, type=int, help="Target aircraft ID")
    parser.add_argument("-fi", "--follower_id", dest='follower_id', default=0, type=int, help="Follower aircraft ID")
    parser.add_argument("-f", "--filename", dest='log_filename', default='log_000', type=str, help="Log file name")
    args = parser.parse_args()

    interface = None
    target_id = args.target_id
    follower_id = args.follower_id
    log_filename = args.log_filename

    max_duration = 100000
    log_placeholder = np.zeros((max_duration, 30))
    i = 0  # for log increment

    ## Prepare Filter Specs ##
    # fs = 5  # Sampling frequency
    # order = 2
    # cutoff = 2.0
    # nyq = 0.5 * fs
    # normal_cutoff = cutoff / nyq
    # b, a = signal.butter(order, normal_cutoff)
    # dgn = dge = dgd = signal.lfilter_zi(b, a)

    ### Deep guidance initialization stuff
    tf.reset_default_graph()

    # Initialize Tensorflow, and load in the policy
    with tf.Session() as sess:
        # Building the policy network
        state_placeholder = tf.placeholder(dtype=tf.float32,
                                           shape=[None, Settings.OBSERVATION_SIZE],
                                           name="state_placeholder")
        actor = BuildActorNetwork(state_placeholder, scope='learner_actor_main')

        # Loading in trained network weights
        print("Attempting to load in previously-trained model\n")
        saver = tf.train.Saver()  # initialize the tensorflow Saver()

        # Try to load in policy network parameters
        try:
            ckpt = tf.train.get_checkpoint_state('../')
            saver.restore(sess, ckpt.model_checkpoint_path)
            print("\nModel successfully loaded!\n")
        except (ValueError, AttributeError):
            print("No model found... quitting :(")
            raise SystemExit

        #######################################################################
        ### Guidance model is loaded, now get data and run it through model ###
        #######################################################################
        try:
            start_time = time.time()
            g = Guidance(interface=interface, target_id=target_id, follower_id=follower_id)
            sleep(0.1)
            # g.set_guided_mode()
            sleep(0.2)

            last_target_yaw = 0.0
            total_time = 0.0

            if Settings.AUGMENT_STATE_WITH_ACTION_LENGTH > 0:
                # Create the state-augmentation queue (holds previous actions)
                past_actions = queue.Queue(maxsize=Settings.AUGMENT_STATE_WITH_ACTION_LENGTH)
                # Fill it with zeros to start
                for j in range(Settings.AUGMENT_STATE_WITH_ACTION_LENGTH):
                    past_actions.put(np.zeros(Settings.ACTION_SIZE), False)

            if Settings.AUGMENT_STATE_WITH_STATE_LENGTH > 0:
                # Create the state-augmentation queue (holds previous raw total states)
                past_states = queue.Queue(maxsize=Settings.AUGMENT_STATE_WITH_STATE_LENGTH)
                # Fill it with zeros to start
                for j in range(Settings.AUGMENT_STATE_WITH_STATE_LENGTH):
                    past_states.put(np.zeros(Settings.TOTAL_STATE_SIZE), False)

            while True:
                # TODO: make better frequency managing
                sleep(g.step)
                total_time = total_time + g.step

                # print('G IDS : ', g.ids)  # debug....

                policy_input = np.zeros(Settings.TOTAL_STATE_SIZE)  # initializing the policy input

                for rc in g.rotorcrafts:
                    rc.timeout = rc.timeout + g.step
                    """
                    policy_input is: [chaser_x, chaser_y, chaser_z, chaser_theta,
                                      target_x, target_y, target_z, target_theta,
                                      chaser_x_dot, chaser_y_dot, chaser_z_dot, chaser_theta_dot
                                      + (optional past action data)]
                    Note: chaser_theta_dot can be returned as a zero (it is discarded before being run through the network)
                    """
                    # print('rc.W', rc.W)  # example to see the positions, or you can get the velocities as well...
                    if rc.id == target_id:  # we've found the target
                        policy_input[3] = rc.X[0]   # target X [north] = North
                        policy_input[4] = -rc.X[1]  # target Y [west] = -East
                        policy_input[5] = rc.X[2]   # target Z [up] = Up
                        policy_input[6] = np.unwrap([last_target_yaw, -rc.W[2]])[1]  # target yaw [counter-clockwise] = -yaw [clockwise]
                        last_target_yaw = policy_input[6]
                        # print("Target position: X: %.2f; Y: %.2f; Z: %.2f; Att %.2f" % (rc.X[0], -rc.X[1], rc.X[2], -rc.W[2]))
                        # Note: rc.X returns position; rc.V returns velocity; rc.W returns attitude

                    if rc.id == follower_id:  # we've found the chaser (follower)
                        policy_input[0] = rc.X[0]   # chaser X [north] = North
                        policy_input[1] = -rc.X[1]  # chaser Y [west] = -East
                        policy_input[2] = rc.X[2]   # chaser Z [up] = Up
                        policy_input[7] = rc.V[0]   # chaser V_x [north] = North
                        policy_input[8] = -rc.V[1]  # chaser V_y [west] = -East
                        policy_input[9] = rc.V[2]   # chaser V_z [up] = Up
                        # print("Time: %.2f; Chaser position: X: %.2f; Y: %.2f; Z: %.2f; Att %.2f; Vx: %.2f; Vy: %.2f; Vz: %.2f" % (rc.timeout, rc.X[0], -rc.X[1], rc.X[2], -rc.W[2], rc.V[0], -rc.V[1], rc.V[2]))
                        # Note: rc.X returns position; rc.V returns velocity; rc.W returns attitude

                # Augment the state with past action data if applicable
                if Settings.AUGMENT_STATE_WITH_ACTION_LENGTH > 0:
                    past_action_data = np.asarray(past_actions.queue).reshape([-1])  # past actions reshaped into a row
                    # Remove the oldest entry from the action log queue
                    past_actions.get(False)
                    # Concatenate past actions to the policy input
                    policy_input = np.concatenate([policy_input, past_action_data])

                ############################################################
                ##### Received data! Process it and return the result! #####
                ############################################################
                # Calculating the proper policy input (deleting irrelevant states and normalizing the input)
                # Normalizing
                if Settings.NORMALIZE_STATE:
                    normalized_policy_input = (policy_input - Settings.STATE_MEAN) / Settings.STATE_HALF_RANGE
                else:
                    normalized_policy_input = policy_input

                # Discarding irrelevant states
                normalized_policy_input = np.delete(normalized_policy_input, Settings.IRRELEVANT_STATES)

                # Reshaping the input
                normalized_policy_input = normalized_policy_input.reshape([-1, Settings.OBSERVATION_SIZE])

                # Run the processed state through the policy
                deep_guidance = sess.run(actor.action_scaled,
                                         feed_dict={state_placeholder: normalized_policy_input})[0]
                # deep_guidance = [chaser_angular_velocity [counter-clockwise looking down from above],
                #                  chaser_x_acceleration [north], chaser_y_acceleration [west], chaser_z_acceleration [up]]

                # Adding the action taken to the past_action log
                if Settings.AUGMENT_STATE_WITH_ACTION_LENGTH > 0:
                    past_actions.put(deep_guidance)

                # Limit guidance commands if the velocity is too high!
                # Check whether a velocity component is too large AND the corresponding acceleration is trying to
                # increase it further; in that case, set the desired linear acceleration to zero.
                current_velocity = policy_input[7:10]
                accel_commands = deep_guidance[-3:]  # the last three outputs are the x/y/z acceleration commands
                limit_mask = (np.abs(current_velocity) > Settings.VELOCITY_LIMIT) & \
                             (np.sign(accel_commands) == np.sign(current_velocity))
                deep_guidance[-3:][limit_mask] = 0

                # deep_guidance[0], dgn = signal.lfilter(b, a, [deep_guidance[0]], zi=dgn)
                # deep_guidance[1], dge = signal.lfilter(b, a, [deep_guidance[1]], zi=dge)
                # deep_guidance[2], dgd = signal.lfilter(b, a, [deep_guidance[2]], zi=dgd)
                # deep_guidance[2], deep_guidance_prev_filt[2] = signal.lfilter(b, a, deep_guidance[2], zi=deep_guidance_prev_filt[2])
                # deep_guidance[3], deep_guidance_prev_filt[3] = signal.lfilter(b, a, deep_guidance[3], zi=deep_guidance_prev_filt[3])

                # Send the velocity/acceleration command to the aircraft!
                # g.move_at_ned_vel(yaw=-deep_guidance[0])
                g.accelerate(north=deep_guidance[0], east=-deep_guidance[1], down=-deep_guidance[2])
                # print("Deep guidance command: a_x: %.2f; a_y: %.2f; a_z: %.2f" % (deep_guidance[1], deep_guidance[2], deep_guidance[3]))

                print("Time: %.2f; X: %.2f; Vx: %.2f; Ax: %.2f" % (total_time, policy_input[0], policy_input[7], deep_guidance[0]))
                print('Deep Guidance :', deep_guidance)
                # print('Policy input shape :', normalized_policy_input.shape)

                # Log all inputs and outputs:
                t = time.time() - start_time
                log_placeholder[i, 0] = t
                log_placeholder[i, 1:4] = deep_guidance
                # log_placeholder[i,5:8] = deep_guidance_xf, deep_guidance_yf, deep_guidance_zf
                log_placeholder[i, 4:4 + len(normalized_policy_input[0])] = normalized_policy_input
                i += 1

        except (KeyboardInterrupt, SystemExit):
            print('Shutting down...')
            g.set_nav_mode()
            g.shutdown()
            sleep(0.2)
            with open(log_filename + ".txt", 'wb') as f:
                np.save(f, log_placeholder[:i])
            exit()
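# ---------------------------------------------------------------------------
# Standalone sketch (not part of the original script): the velocity-limit guard
# used above, isolated so it can be sanity-checked with dummy numbers. The
# function name, `velocity_limit` value, and array sizes are illustrative
# assumptions, not part of the original code.
import numpy as np

def limit_accelerations(accelerations, velocities, velocity_limit):
    """Zero any acceleration that would push an already over-limit velocity even further."""
    accelerations = accelerations.copy()
    mask = (np.abs(velocities) > velocity_limit) & (np.sign(accelerations) == np.sign(velocities))
    accelerations[mask] = 0.0
    return accelerations

# Example:
# limit_accelerations(np.array([0.4, 0.3, -0.8]),
#                     np.array([1.5, -0.2, -2.0]),
#                     velocity_limit=1.0)
# -> array([0. , 0.3, 0. ])   only the x and z commands are zeroed
# ---------------------------------------------------------------------------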
def main():
    import argparse
    parser = argparse.ArgumentParser(description="Guided mode example")
    parser.add_argument("-ti", "--target_id", dest='target_id', default=0, type=int, help="Target aircraft ID")
    parser.add_argument("-fi", "--follower_id", dest='follower_id', default=0, type=int, help="Follower aircraft ID")
    parser.add_argument("-f", "--filename", dest='log_filename', default='log_accel_000', type=str, help="Log file name")
    parser.add_argument("-d", "--deadband", dest='deadband_radius', default='0.0', type=float, help="deadband radius")
    parser.add_argument("-no_avg", "--dont_average_output", dest="dont_average_output", action="store_true")
    args = parser.parse_args()

    interface = None
    target_id = args.target_id
    follower_id = args.follower_id
    log_filename = args.log_filename
    deadband_radius = args.deadband_radius

    max_duration = 100000
    log_placeholder = np.zeros((max_duration, 30))
    i = 0  # for log increment

    # Flag to not average the guidance output
    dont_average_output = args.dont_average_output
    if dont_average_output:
        print("\n\nDeep guidance output is NOT averaged\n\n")
    else:
        print("\n\nDeep guidance output is averaged\n\n")

    timestep = Settings.TIMESTEP

    ### Deep guidance initialization stuff
    tf.reset_default_graph()

    # Initialize Tensorflow, and load in the policy
    with tf.Session() as sess:
        # Building the policy network
        state_placeholder = tf.placeholder(dtype=tf.float32,
                                           shape=[None, Settings.OBSERVATION_SIZE],
                                           name="state_placeholder")
        actor = BuildActorNetwork(state_placeholder, scope='learner_actor_main')

        # Loading in trained network weights
        print("Attempting to load in previously-trained model\n")
        saver = tf.train.Saver()  # initialize the tensorflow Saver()

        # Try to load in policy network parameters
        try:
            ckpt = tf.train.get_checkpoint_state('../')
            saver.restore(sess, ckpt.model_checkpoint_path)
            print("\nModel successfully loaded!\n")
        except (ValueError, AttributeError):
            print("No model found... quitting :(")
            raise SystemExit

        #######################################################################
        ### Guidance model is loaded, now get data and run it through model ###
        #######################################################################
        try:
            start_time = time.time()
            g = Guidance(interface=interface, quad_ids=[target_id, follower_id])
            sleep(0.1)
            # g.set_guided_mode()
            sleep(0.2)

            last_target_yaw = 0.0
            total_time = 0.0
            last_deep_guidance = np.zeros(Settings.ACTION_SIZE)

            if Settings.AUGMENT_STATE_WITH_ACTION_LENGTH > 0:
                # Create the state-augmentation queue (holds previous actions)
                past_actions = queue.Queue(maxsize=Settings.AUGMENT_STATE_WITH_ACTION_LENGTH)
                # Fill it with zeros to start
                for j in range(Settings.AUGMENT_STATE_WITH_ACTION_LENGTH):
                    past_actions.put(np.zeros(Settings.ACTION_SIZE), False)

            while True:
                # TODO: make better frequency managing
                sleep(timestep)
                total_time = total_time + timestep

                # print('G IDS : ', g.ids)  # debug....

                policy_input = np.zeros(Settings.TOTAL_STATE_SIZE)  # initializing the policy input

                for rc in g.rotorcrafts:
                    rc.timeout = rc.timeout + timestep
                    """
                    policy_input is: [chaser_x, chaser_y, chaser_z,
                                      target_x, target_y, target_z, target_theta,
                                      chaser_x_dot, chaser_y_dot, chaser_z_dot,
                                      (optional past action data)]
                    """
                    # print('rc.W', rc.W)  # example to see the positions, or you can get the velocities as well...
                    if rc.id == target_id:  # we've found the target
                        policy_input[3] = rc.X[0]   # target X [north] = North
                        policy_input[4] = -rc.X[1]  # target Y [west] = -East
                        policy_input[5] = rc.X[2]   # target Z [up] = Up
                        policy_input[6] = np.unwrap([last_target_yaw, -rc.W[2]])[1]  # target yaw [counter-clockwise] = -yaw [clockwise]
                        last_target_yaw = policy_input[6]
                        # print("Target position: X: %.2f; Y: %.2f; Z: %.2f; Att %.2f" % (rc.X[0], -rc.X[1], rc.X[2], -rc.W[2]))
                        # Note: rc.X returns position; rc.V returns velocity; rc.W returns attitude

                    if rc.id == follower_id:  # we've found the chaser (follower)
                        policy_input[0] = rc.X[0]   # chaser X [north] = North
                        policy_input[1] = -rc.X[1]  # chaser Y [west] = -East
                        policy_input[2] = rc.X[2]   # chaser Z [up] = Up
                        policy_input[7] = rc.V[0]   # chaser V_x [north] = North
                        policy_input[8] = -rc.V[1]  # chaser V_y [west] = -East
                        policy_input[9] = rc.V[2]   # chaser V_z [up] = Up
                        # print("Time: %.2f; Chaser position: X: %.2f; Y: %.2f; Z: %.2f; Att %.2f; Vx: %.2f; Vy: %.2f; Vz: %.2f" % (rc.timeout, rc.X[0], -rc.X[1], rc.X[2], -rc.W[2], rc.V[0], -rc.V[1], rc.V[2]))
                        # Note: rc.X returns position; rc.V returns velocity; rc.W returns attitude

                # Augment the state with past action data if applicable
                if Settings.AUGMENT_STATE_WITH_ACTION_LENGTH > 0:
                    past_action_data = np.asarray(past_actions.queue).reshape([-1])  # past actions reshaped into a row
                    # Remove the oldest entry from the action log queue
                    past_actions.get(False)
                    # Concatenate past actions to the policy input
                    policy_input = np.concatenate([policy_input, past_action_data])

                ############################################################
                ##### Received data! Process it and return the result! #####
                ############################################################
                # Calculating the proper policy input (deleting irrelevant states and normalizing the input)
                # Normalizing
                if Settings.NORMALIZE_STATE:
                    normalized_policy_input = (policy_input - Settings.STATE_MEAN) / Settings.STATE_HALF_RANGE
                else:
                    normalized_policy_input = policy_input

                # Discarding irrelevant states
                normalized_policy_input = np.delete(normalized_policy_input, Settings.IRRELEVANT_STATES)

                # Reshaping the input
                normalized_policy_input = normalized_policy_input.reshape([-1, Settings.OBSERVATION_SIZE])

                # Run the processed state through the policy
                deep_guidance = sess.run(actor.action_scaled,
                                         feed_dict={state_placeholder: normalized_policy_input})[0]
                # deep_guidance = [chaser_x_acceleration [north], chaser_y_acceleration [west], chaser_z_acceleration [up]]

                # Adding the action taken to the past_action log
                if Settings.AUGMENT_STATE_WITH_ACTION_LENGTH > 0:
                    past_actions.put(deep_guidance)

                # Limit guidance commands if the velocity is too high!
                # Check whether our velocity is too large AND the acceleration is trying to increase it further;
                # in that case, set the desired linear acceleration to zero.
                current_velocity = policy_input[7:10]
                deep_guidance[(np.abs(current_velocity[0:2]) > Settings.VELOCITY_LIMIT) &
                              (np.sign(deep_guidance) == np.sign(current_velocity[0:2]))] = 0

                # If we are in the deadband, set the acceleration to zero!
                # desired_location = np.array([policy_input[3] + 3*np.cos(policy_input[6]), policy_input[4] + 3*np.sin(policy_input[6]), policy_input[5]])
                # current_location = policy_input[0:3]
                # deep_guidance[np.abs((np.abs(current_location) - np.abs(desired_location))) < deadband_radius] = 0

                average_deep_guidance = (last_deep_guidance + deep_guidance) / 2.0
                last_deep_guidance = deep_guidance

                # Send velocity/acceleration command to aircraft!
                # g.accelerate(north=deep_guidance[0], east=-deep_guidance[1], down=-deep_guidance[2])

                # Since we are accelerating only in 2D, we must send an altitude command.
                desired_altitude = 3  # [m]
                if dont_average_output:
                    g.accelerate(north=deep_guidance[0], east=-deep_guidance[1], down=desired_altitude, quad_id=follower_id)
                else:
                    g.accelerate(north=average_deep_guidance[0], east=-average_deep_guidance[1], down=desired_altitude, quad_id=follower_id)  # Averaged

                # g.accelerate(north = average_deep_guidance[0], east = -average_deep_guidance[1], down = -average_deep_guidance[2], quad_id = follower_id)
                # g.accelerate(north = 1, east = 0.1, down = 0)

                print("X: %2.2f Y: %2.2f Z: %2.2f Vx: %2.2f Vy: %2.2f Vz: %2.2f Guidance_X: %.2f, Y: %.2f, Z: Alt held at %.2f"
                      % (policy_input[0], policy_input[1], policy_input[2],
                         policy_input[7], policy_input[8], policy_input[9],
                         average_deep_guidance[0], average_deep_guidance[1], desired_altitude))

                # Log all input and outputs:
                t = time.time() - start_time
                log_placeholder[i, 0] = t
                log_placeholder[i, 1:3] = deep_guidance
                log_placeholder[i, 3:5] = average_deep_guidance
                # log_placeholder[i,5:8] = deep_guidance_xf, deep_guidance_yf, deep_guidance_zf
                log_placeholder[i, 5:5 + len(policy_input)] = policy_input
                i += 1

        except (KeyboardInterrupt, SystemExit):
            print('Shutting down...')
            g.shutdown()
            sleep(0.2)
            with open(log_filename + ".txt", 'wb') as f:
                np.save(f, log_placeholder[:i])
            exit()
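# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original script): the output smoothing
# used above is a two-tap moving average of successive guidance commands,
# average_k = (a_{k-1} + a_k) / 2. It is isolated here so its behaviour can be
# seen with dummy numbers; the class and argument names are illustrative.
import numpy as np

class GuidanceAverager:
    """Averages each new command with the previous one to soften step changes."""
    def __init__(self, action_size):
        self.last_command = np.zeros(action_size)

    def __call__(self, command):
        averaged = (self.last_command + command) / 2.0
        self.last_command = command
        return averaged

# Example: a step from 0 to 1 m/s^2 is spread over two timesteps.
# averager = GuidanceAverager(action_size=2)
# averager(np.array([1.0, 0.0]))  -> array([0.5, 0. ])
# averager(np.array([1.0, 0.0]))  -> array([1. , 0. ])
# ---------------------------------------------------------------------------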
def main():
    import argparse
    parser = argparse.ArgumentParser(description="Guided mode example")
    parser.add_argument("-ti", "--target_id", dest='target_id', default=0, type=int, help="Target aircraft ID")
    parser.add_argument("-fi", "--follower_id", dest='follower_id', default=0, type=int, help="Follower aircraft ID")
    parser.add_argument("-f", "--filename", dest='log_filename', default='log_velocity_000', type=str, help="Log file name")
    args = parser.parse_args()

    interface = None
    target_id = args.target_id
    follower_id = args.follower_id
    log_filename = args.log_filename

    max_duration = 100000
    log_placeholder = np.zeros((max_duration, 30))
    i = 0  # for log increment

    timestep = Settings.TIMESTEP

    ### Deep guidance initialization stuff
    tf.reset_default_graph()

    # Initialize Tensorflow, and load in the policy
    with tf.Session() as sess:
        # Building the policy network
        state_placeholder = tf.placeholder(dtype=tf.float32,
                                           shape=[None, Settings.OBSERVATION_SIZE],
                                           name="state_placeholder")
        actor = BuildActorNetwork(state_placeholder, scope='learner_actor_main')

        # Loading in trained network weights
        print("Attempting to load in previously-trained model\n")
        saver = tf.train.Saver()  # initialize the tensorflow Saver()

        # Try to load in policy network parameters
        try:
            ckpt = tf.train.get_checkpoint_state('../')
            saver.restore(sess, ckpt.model_checkpoint_path)
            print("\nModel successfully loaded!\n")
        except (ValueError, AttributeError):
            print("No model found... quitting :(")
            raise SystemExit

        #######################################################################
        ### Guidance model is loaded, now get data and run it through model ###
        #######################################################################
        ### End deep guidance initialization stuff

        try:
            start_time = time.time()
            g = Guidance(interface=interface, quad_ids=[target_id, follower_id])
            sleep(0.1)
            g.set_guided_mode(quad_id=follower_id)
            sleep(0.2)

            last_target_yaw = 0.0

            if Settings.AUGMENT_STATE_WITH_ACTION_LENGTH > 0:
                # Create the state-augmentation queue (holds previous actions)
                past_actions = queue.Queue(maxsize=Settings.AUGMENT_STATE_WITH_ACTION_LENGTH)
                # Fill it with zeros to start
                for j in range(Settings.AUGMENT_STATE_WITH_ACTION_LENGTH):
                    past_actions.put(np.zeros(Settings.ACTION_SIZE), False)

            while True:
                # TODO: make better frequency managing
                sleep(timestep)

                # print('G IDS : ', g.ids)  # debug....

                policy_input = np.zeros(Settings.TOTAL_STATE_SIZE)  # initializing the policy input

                for rc in g.rotorcrafts:
                    rc.timeout = rc.timeout + timestep
                    # print('rc.id', rc.id)
                    # print('rc.W', rc.W)  # example to see the positions, or you can get the velocities as well...
                    if rc.id == target_id:  # we've found the target
                        policy_input[3] = rc.X[0]   # target X [north] = North
                        policy_input[4] = -rc.X[1]  # target Y [west] = -East
                        policy_input[5] = rc.X[2]   # target Z [up] = Up
                        policy_input[6] = np.unwrap([last_target_yaw, -rc.W[2]])[1]  # target yaw [counter-clockwise] = -yaw [clockwise]
                        last_target_yaw = policy_input[6]
                        # Note: rc.X returns position; rc.V returns velocity; rc.W returns attitude

                    if rc.id == follower_id:  # we've found the chaser (follower)
                        policy_input[0] = rc.X[0]   # chaser X [north] = North
                        policy_input[1] = -rc.X[1]  # chaser Y [west] = -East
                        policy_input[2] = rc.X[2]   # chaser Z [up] = Up
                        # Note: rc.X returns position; rc.V returns velocity; rc.W returns attitude

                # Augment the state with past action data if applicable
                if Settings.AUGMENT_STATE_WITH_ACTION_LENGTH > 0:
                    past_action_data = np.asarray(past_actions.queue).reshape([-1])  # past actions reshaped into a row
                    # Remove the oldest entry from the action log queue
                    past_actions.get(False)
                    # Concatenate past actions to the policy input
                    policy_input = np.concatenate([policy_input, past_action_data])

                ############################################################
                ##### Received data! Process it and return the result! #####
                ############################################################
                # Calculating the proper policy input (deleting irrelevant states and normalizing the input)
                # Normalizing
                if Settings.NORMALIZE_STATE:
                    normalized_policy_input = (policy_input - Settings.STATE_MEAN) / Settings.STATE_HALF_RANGE
                else:
                    normalized_policy_input = policy_input

                # Discarding irrelevant states
                normalized_policy_input = np.delete(normalized_policy_input, Settings.IRRELEVANT_STATES)

                # Reshaping the input
                normalized_policy_input = normalized_policy_input.reshape([-1, Settings.OBSERVATION_SIZE])

                # Run the processed state through the policy
                deep_guidance = sess.run(actor.action_scaled,
                                         feed_dict={state_placeholder: normalized_policy_input})[0]
                # deep_guidance = [chaser_x_velocity [north], chaser_y_velocity [west], chaser_z_velocity [up],
                #                  chaser_angular_velocity [counter-clockwise looking down from above]]

                # Adding the action taken to the past_action log
                if Settings.AUGMENT_STATE_WITH_ACTION_LENGTH > 0:
                    past_actions.put(deep_guidance)

                # Send velocity command to aircraft!
                g.move_at_ned_vel(north=deep_guidance[0], east=-deep_guidance[1], down=0, quad_id=follower_id)
                print("Policy input: ", policy_input, "Deep guidance command: ", deep_guidance)

                # Log all input and outputs:
                t = time.time() - start_time
                log_placeholder[i, 0] = t
                log_placeholder[i, 1:3] = deep_guidance
                log_placeholder[i, 3:5] = deep_guidance
                # log_placeholder[i,5:8] = deep_guidance_xf, deep_guidance_yf, deep_guidance_zf
                log_placeholder[i, 5:5 + len(policy_input)] = policy_input
                i += 1

        except (KeyboardInterrupt, SystemExit):
            print('Shutting down...')
            g.set_nav_mode(quad_id=follower_id)
            g.shutdown()
            sleep(0.2)
            with open(log_filename + ".txt", 'wb') as f:
                np.save(f, log_placeholder[:i])
            exit()
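# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original script): the past-action state
# augmentation used in the loops above, isolated as a small helper. A FIFO of
# the last N actions is flattened and appended to the raw state; each timestep
# the oldest action is dropped before concatenation and the newest action is
# pushed afterwards. The class name and sizes are illustrative.
import queue
import numpy as np

class PastActionAugmenter:
    def __init__(self, action_size, history_length):
        self.history = queue.Queue(maxsize=history_length)
        for _ in range(history_length):
            self.history.put(np.zeros(action_size), False)

    def augment(self, state):
        past_action_data = np.asarray(self.history.queue).reshape([-1])
        self.history.get(False)  # drop the oldest action
        return np.concatenate([state, past_action_data])

    def record(self, action):
        self.history.put(action, False)  # remember the action just taken

# Example with a 2-element action and a history of 3:
# augmenter = PastActionAugmenter(action_size=2, history_length=3)
# augmenter.augment(np.zeros(4)).shape   -> (10,)   i.e. 4 raw states + 3*2 past actions
# augmenter.record(np.array([0.1, -0.2]))
# ---------------------------------------------------------------------------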
def main():
    import argparse
    parser = argparse.ArgumentParser(description="Guided mode example")
    parser.add_argument('-ids', '--quad_ids', nargs='+', help='<Required> IDs of all quads used', required=True)
    parser.add_argument("-f", "--filename", dest='log_filename', default='log_runway_000', type=str, help="Log file name")
    parser.add_argument("-L", "--new_length", dest='new_length', default=Settings.RUNWAY_LENGTH, type=float, help="Override the 124 m runway length")
    parser.add_argument("-W", "--new_width", dest='new_width', default=Settings.RUNWAY_WIDTH, type=float, help="Override the 12.5 m runway width")
    parser.add_argument("-alt", "--altitude", dest='altitude', default=2, type=float, help="Override the 2 m first quad altitude")
    parser.add_argument("-no_avg", "--dont_average_output", dest="dont_average_output", action="store_true")
    parser.add_argument("-fail", "--failure_times", nargs='+', dest="failure_times", default=[9999.9])
    args = parser.parse_args()

    failure_times = list(map(float, args.failure_times))

    if Settings.INDOORS:
        runway_angle_wrt_north = 0
    else:
        runway_angle_wrt_north = (156.485 - 90) * np.pi / 180  # [rad]
    C_bI = make_C_bI(runway_angle_wrt_north)        # [3x3] rotation matrix from North (I) to body (tilted runway)
    C_bI_22 = make_C_bI_22(runway_angle_wrt_north)  # [2x2] rotation matrix from North (I) to body (tilted runway)

    # Use the runway dimensions, overridden by the user if requested
    runway_length = args.new_length
    if args.new_length != Settings.RUNWAY_LENGTH:
        print("\nOverwriting %.1f m runway length with user-defined %.1f m runway length." % (Settings.RUNWAY_LENGTH, args.new_length))
    runway_width = args.new_width
    if args.new_width != Settings.RUNWAY_WIDTH:
        print("\nOverwriting %.1f m runway width with user-defined %.1f m runway width." % (Settings.RUNWAY_WIDTH, args.new_width))

    first_quad_altitude = args.altitude

    # Communication delay length in timesteps
    COMMUNICATION_DELAY_LENGTH = 0  # timesteps
    if COMMUNICATION_DELAY_LENGTH > 0:
        print("\nA simulated communication delay of %.1f seconds is used" % (COMMUNICATION_DELAY_LENGTH * Settings.TIMESTEP))

    interface = None
    not_done = True
    failure_acknowledged = np.tile(False, len(failure_times))

    average_deep_guidance_NED = np.zeros([Settings.NUMBER_OF_QUADS, Settings.ACTION_SIZE])

    # Converting this input from a list of strings to a list of ints
    all_ids = list(map(int, args.quad_ids))

    log_filename = args.log_filename
    max_duration = 100000
    log_placeholder = np.zeros((max_duration, 3 * Settings.NUMBER_OF_QUADS + 1 + Settings.TOTAL_STATE_SIZE + 6))
    log_counter = 0  # for log increment

    # Flag to not average the guidance output
    dont_average_output = args.dont_average_output
    if dont_average_output:
        print("\n\nDeep guidance output is NOT averaged\n\n")
    else:
        print("\n\nDeep guidance output is averaged\n\n")

    timestep = Settings.TIMESTEP

    # Generate Polygons for the runway tiles
    # The size of each runway grid element
    each_runway_length_element = Settings.RUNWAY_LENGTH / Settings.RUNWAY_LENGTH_ELEMENTS
    each_runway_width_element = Settings.RUNWAY_WIDTH / Settings.RUNWAY_WIDTH_ELEMENTS
    tile_polygons = []
    for i in range(Settings.RUNWAY_LENGTH_ELEMENTS):
        this_row = []
        for j in range(Settings.RUNWAY_WIDTH_ELEMENTS):
            # make the polygon
            this_row.append(
                Polygon([[each_runway_length_element * i - Settings.RUNWAY_LENGTH / 2,
                          each_runway_width_element * j - Settings.RUNWAY_WIDTH / 2],
                         [each_runway_length_element * (i + 1) - Settings.RUNWAY_LENGTH / 2,
                          each_runway_width_element * j - Settings.RUNWAY_WIDTH / 2],
                         [each_runway_length_element * (i + 1) - Settings.RUNWAY_LENGTH / 2,
                          each_runway_width_element * (j + 1) - Settings.RUNWAY_WIDTH / 2],
                         [each_runway_length_element * i - Settings.RUNWAY_LENGTH / 2,
                          each_runway_width_element * (j + 1) - Settings.RUNWAY_WIDTH / 2]]))
        tile_polygons.append(this_row)
    ### Deep guidance initialization stuff
    tf.reset_default_graph()

    # Initialize Tensorflow, and load in the policy
    with tf.Session() as sess:
        # Building the policy network
        state_placeholder = tf.placeholder(dtype=tf.float32,
                                           shape=[None, Settings.OBSERVATION_SIZE],
                                           name="state_placeholder")
        actor = BuildActorNetwork(state_placeholder, scope='learner_actor_main')

        # Loading in trained network weights
        print("Attempting to load in previously-trained model\n")
        saver = tf.train.Saver()  # initialize the tensorflow Saver()

        # Try to load in policy network parameters
        try:
            ckpt = tf.train.get_checkpoint_state('../')
            saver.restore(sess, ckpt.model_checkpoint_path)
            print("\nModel successfully loaded!\n")
        except (ValueError, AttributeError):
            print("No model found... quitting :(")
            raise SystemExit

        #######################################################################
        ### Guidance model is loaded, now get data and run it through model ###
        #######################################################################
        try:
            start_time = time.time()
            g = Guidance(interface=interface, quad_ids=all_ids)
            sleep(0.1)
            # g.set_guided_mode()
            sleep(0.2)

            total_time = 0.0
            last_deep_guidance = np.zeros(Settings.ACTION_SIZE)

            if Settings.AUGMENT_STATE_WITH_ACTION_LENGTH > 0:
                # Create the state-augmentation queue (holds previous actions)
                past_actions = queue.Queue(maxsize=Settings.AUGMENT_STATE_WITH_ACTION_LENGTH)
                # Fill it with zeros to start
                for j in range(Settings.AUGMENT_STATE_WITH_ACTION_LENGTH):
                    past_actions.put(np.zeros([Settings.NUMBER_OF_QUADS, Settings.ACTION_SIZE]), False)

            # Initializing
            runway_state = np.zeros([Settings.RUNWAY_LENGTH_ELEMENTS, Settings.RUNWAY_WIDTH_ELEMENTS])

            # A list of all the tiles that rewards should not be returned for
            if Settings.RUNWAY_SHAPE == 'R':
                blank_tiles = []
                print("Rectangular-shaped runway is used!")
            elif Settings.RUNWAY_SHAPE == 'L':
                blank_tiles = list(([2, 0], [3, 0], [4, 0], [5, 0], [6, 0], [7, 0],
                                    [2, 1], [3, 1], [4, 1], [5, 1], [6, 1], [7, 1]))
                print("L-shaped runway is used!")
            elif Settings.RUNWAY_SHAPE == 'C':
                blank_tiles = list(([2, 0], [3, 0], [4, 0], [5, 0],
                                    [2, 1], [3, 1], [4, 1], [5, 1]))
                print("C-shaped runway is used!")

            # Assign tiles we don't care about to 1 (meaning they won't give any reward)
            for i in range(len(blank_tiles)):
                runway_state[blank_tiles[i][0], blank_tiles[i][1]] = 1

            last_runway_state = np.copy(runway_state)

            desired_altitudes = np.linspace(first_quad_altitude + Settings.NUMBER_OF_QUADS * 2,
                                            first_quad_altitude,
                                            Settings.NUMBER_OF_QUADS,
                                            endpoint=False)

            while not_done:
                # TODO: make better frequency managing
                sleep(timestep)

                # Initializing quadrotor positions and velocities
                quad_positions = np.zeros([Settings.NUMBER_OF_QUADS, 3])
                quad_velocities = np.zeros([Settings.NUMBER_OF_QUADS, 3])

                quad_number_not_id = 0
                for rc in g.rotorcrafts:
                    rc.timeout = rc.timeout + timestep
                    """
                    policy_input is: [chaser_x, chaser_y, chaser_z,
                                      target_x, target_y, target_z, target_theta,
                                      chaser_x_dot, chaser_y_dot, chaser_z_dot,
                                      (optional past action data)]
                    """
                    # Extracting position
                    quad_positions[quad_number_not_id, 0] = rc.X[0]
                    quad_positions[quad_number_not_id, 1] = -rc.X[1]
                    quad_positions[quad_number_not_id, 2] = rc.X[2]

                    # Rotating from the inertial frame (NED frame) into the body frame (runway frame)
                    quad_positions[quad_number_not_id, :] = np.matmul(C_bI, quad_positions[quad_number_not_id, :])
                    # Scale positions after rotating
                    quad_positions[quad_number_not_id, 0] = quad_positions[quad_number_not_id, 0] * Settings.RUNWAY_LENGTH / runway_length  # scaling to the new runway length
                    quad_positions[quad_number_not_id, 1] = quad_positions[quad_number_not_id, 1] * Settings.RUNWAY_WIDTH / runway_width    # scaling to the new runway width

                    # Extracting velocity
                    quad_velocities[quad_number_not_id, 0] = rc.V[0]   # *Settings.RUNWAY_LENGTH/runway_length
                    quad_velocities[quad_number_not_id, 1] = -rc.V[1]  # *Settings.RUNWAY_WIDTH/runway_width
                    quad_velocities[quad_number_not_id, 2] = rc.V[2]

                    # Rotating from the inertial frame (NED frame) into the body frame (runway frame)
                    quad_velocities[quad_number_not_id, :] = np.matmul(C_bI, quad_velocities[quad_number_not_id, :])

                    # Scale velocities after rotating
                    quad_velocities[quad_number_not_id, 0] = quad_velocities[quad_number_not_id, 0] * Settings.RUNWAY_LENGTH / runway_length
                    quad_velocities[quad_number_not_id, 1] = quad_velocities[quad_number_not_id, 1] * Settings.RUNWAY_WIDTH / runway_width

                    quad_number_not_id += 1

                # Resetting the action delay queue
                if total_time == 0.0:
                    if COMMUNICATION_DELAY_LENGTH > 0:
                        communication_delay_queue = queue.Queue(maxsize=COMMUNICATION_DELAY_LENGTH + 1)
                        # Fill it with zeros initially
                        for i in range(COMMUNICATION_DELAY_LENGTH):
                            communication_delay_queue.put([quad_positions, quad_velocities], False)

                    # Resetting the initial previous position to be the first position
                    previous_quad_positions = quad_positions

                # Checking if a quadrotor has failed
                for i in range(len(failure_times)):
                    if (total_time > failure_times[i]) and (not failure_acknowledged[i]):
                        if total_time == failure_times[i]:
                            print("\n\nQuad %i has failed!\n\n" % i)
                        # Force the position and velocity to their 'failed' states
                        quad_positions[i, :] = Settings.LOWER_STATE_BOUND[:3]
                        previous_quad_positions[i, :] = quad_positions[i, :]
                        quad_velocities[i, :] = np.zeros([3])

                if COMMUNICATION_DELAY_LENGTH > 0:
                    communication_delay_queue.put([quad_positions, quad_velocities], False)  # puts the current position and velocity at the bottom of the stack
                    delayed_quad_positions, delayed_quad_velocities = communication_delay_queue.get(False)  # grabs the delayed position and velocity

                ##############################################################
                ### Check the runway for new tiles that have been explored ###
                ##############################################################
                # Generate quadrotor LineStrings
                for i in range(Settings.NUMBER_OF_QUADS):
                    quad_line = LineString([quad_positions[i, :-1], previous_quad_positions[i, :-1]])

                    for j in range(Settings.RUNWAY_LENGTH_ELEMENTS):
                        for k in range(Settings.RUNWAY_WIDTH_ELEMENTS):
                            # If this element has already been explored, skip it
                            if runway_state[j, k] == 0 and quad_line.intersects(tile_polygons[j][k]):
                                runway_state[j, k] = 1
                                # print("Quad %i traced the line %s and explored runway element length = %i, width = %i with coordinates %s" % (i, list(quad_line.coords), j, k, tile_polygons[j][k].bounds))

                # Storing current quad positions for the next timestep
                previous_quad_positions = quad_positions

                # Print if a new tile has been explored
                if np.any(last_runway_state != runway_state):
                    print("Runway elements discovered %i/%i" % (np.sum(runway_state), Settings.RUNWAY_LENGTH_ELEMENTS * Settings.RUNWAY_WIDTH_ELEMENTS))

                    # Draw a new runway
                    print(np.flip(runway_state))

                if np.all(runway_state) == 1:
                    print("Explored the entire runway in %.2f seconds--Congratulations! Quitting deep guidance" % (time.time() - start_time))
                    not_done = False

                total_states = []
                # Building NUMBER_OF_QUADS states
                for j in range(Settings.NUMBER_OF_QUADS):
                    # Start the state with your own
                    this_quads_state = np.concatenate([quad_positions[j, :], quad_velocities[j, :]])
                    # Add in the others' states, starting with the next quad and finishing with the previous quad
                    for k in range(j + 1, Settings.NUMBER_OF_QUADS + j):
                        if COMMUNICATION_DELAY_LENGTH > 0:
                            this_quads_state = np.concatenate([this_quads_state,
                                                               delayed_quad_positions[k % Settings.NUMBER_OF_QUADS, :],
                                                               delayed_quad_velocities[k % Settings.NUMBER_OF_QUADS, :]])
                        else:
                            this_quads_state = np.concatenate([this_quads_state,
                                                               quad_positions[k % Settings.NUMBER_OF_QUADS, :],
                                                               quad_velocities[k % Settings.NUMBER_OF_QUADS, :]])

                    # All quad data is included; the runway state is appended below before the total_state is used
                    total_states.append(this_quads_state)

                # Augment total_state with past actions, if appropriate
                if Settings.AUGMENT_STATE_WITH_ACTION_LENGTH > 0:
                    # total_states = [Settings.NUMBER_OF_QUADS, Settings.TOTAL_STATE_SIZE]
                    # We just received a total_state from the environment and need to augment it with the past action data.
                    # past_action_data has shape [AUGMENT_STATE_WITH_ACTION_LENGTH, NUMBER_OF_QUADS, ACTION_SIZE];
                    # the first and second axes are swapped so it can be reshaped into one row per quad.
                    past_action_data = np.swapaxes(np.asarray(past_actions.queue), 0, 1).reshape([Settings.NUMBER_OF_QUADS, -1])
                    total_states = np.concatenate([np.asarray(total_states), past_action_data], axis=1)

                    # Remove the oldest entry from the action log queue
                    past_actions.get(False)

                # Concatenating the runway state to the augmented state
                total_states = np.concatenate([total_states,
                                               np.tile(runway_state.reshape(-1), (Settings.NUMBER_OF_QUADS, 1))], axis=1)

                total_state_to_log = total_states[0, :]

                # Normalize the state
                if Settings.NORMALIZE_STATE:
                    total_states = (total_states - Settings.STATE_MEAN) / Settings.STATE_HALF_RANGE

                # Discarding irrelevant states
                observations = np.delete(total_states, Settings.IRRELEVANT_STATES, axis=1)

                # Run the processed states through the policy
                deep_guidance = sess.run(actor.action_scaled,
                                         feed_dict={state_placeholder: observations})
                # deep_guidance = [chaser_x_acceleration [runway north], chaser_y_acceleration [runway west], chaser_z_acceleration [up]]

                # Adding the action taken to the past_action log
                if Settings.AUGMENT_STATE_WITH_ACTION_LENGTH > 0:
                    past_actions.put(deep_guidance)

                # Limit guidance commands if the velocity is too high!
                # Check whether a velocity component is too large AND the acceleration is trying to increase it further;
                # in that case, saturate the desired linear acceleration.
                for j in range(Settings.NUMBER_OF_QUADS):
                    # print(quad_velocities[j, 0:2], (np.abs(quad_velocities[j, 0:2]) > Settings.VELOCITY_LIMIT) & (np.sign(deep_guidance[j, :]) == np.sign(quad_velocities[j, 0:2])), "Unsaturated: ", deep_guidance, end=' ')
                    # If a velocity component is over the limit AND the commanded acceleration would increase it further,
                    # replace that acceleration with a 2 m/s^2 braking command in the opposite direction
                    limit_mask = (np.abs(quad_velocities[j, 0:2]) > Settings.VELOCITY_LIMIT) & \
                                 (np.sign(deep_guidance[j, :]) == np.sign(quad_velocities[j, 0:2]))
                    deep_guidance[j, limit_mask] = -2 * np.sign(deep_guidance[j, limit_mask])
                    # print("Saturated: ", deep_guidance, end=' ')

                average_deep_guidance = (last_deep_guidance + deep_guidance) / 2.0
                last_deep_guidance = deep_guidance
                last_runway_state = np.copy(runway_state)

                # Get each quad to accelerate appropriately
                for j in range(Settings.NUMBER_OF_QUADS):
                    # Checking if a quadrotor has failed
                    for i in range(len(failure_times)):
                        if (total_time > failure_times[i]) and (i == j):
                            if np.abs(total_time - failure_times[i]) < 0.5:
                                print("\n\nSimulated failure of quad %i\n\n" % (all_ids[i]))
                            skip_this_one = True
                            break
                        else:
                            skip_this_one = False
                    if skip_this_one:
                        continue

                    # Each quad is assigned a different altitude to remain at.
                    desired_altitude = desired_altitudes[j]

                    # Rotate the desired deep guidance command from the body frame (runway frame) to the inertial frame (NED frame)
                    # print("Unrotated average: ", average_deep_guidance)
                    average_deep_guidance_NED[j, :] = np.matmul(C_bI_22.T, average_deep_guidance[j, :])
                    # print("Rotated average: ", average_deep_guidance)

                    if dont_average_output:
                        g.accelerate(north=deep_guidance[j, 0], east=-deep_guidance[j, 1], down=desired_altitude, quad_id=g.ids[j])
                        print("The no_avg setting is broken")
                        raise SystemExit
                    else:
                        g.accelerate(north=average_deep_guidance_NED[j, 0], east=-average_deep_guidance_NED[j, 1], down=desired_altitude, quad_id=g.ids[j])  # Averaged

                total_time = total_time + timestep

                # Log all input and outputs:
                t = time.time() - start_time
                log_placeholder[log_counter, 0] = t
                log_placeholder[log_counter, 1:3 * Settings.NUMBER_OF_QUADS + 1] = np.concatenate([average_deep_guidance.reshape(-1), desired_altitudes.reshape(-1)])
                # log_placeholder[i,5:8] = deep_guidance_xf, deep_guidance_yf, deep_guidance_zf
                log_placeholder[log_counter, 3 * Settings.NUMBER_OF_QUADS + 1:3 * Settings.NUMBER_OF_QUADS + 1 + Settings.TOTAL_STATE_SIZE + 6] = total_state_to_log
                log_counter += 1

            # If we ended gracefully
            exit()

        # If we ended forcefully
        except (KeyboardInterrupt, SystemExit):
            print('Shutting down...')
            g.shutdown()
            sleep(0.2)
            print("Saving file as %s..." % (log_filename + "_L" + str(runway_length) + "_W" + str(runway_width) + ".txt"))
            with open(log_filename + "_L" + str(runway_length) + "_W" + str(runway_width) + ".txt", 'wb') as f:
                np.save(f, log_placeholder[:log_counter])
            print("Done!")
            exit()
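# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original script): the runway-coverage
# bookkeeping used above, reduced to a single quad and a small grid. A tile is
# marked explored when the straight-line segment between the previous and the
# current position intersects its polygon. Grid dimensions and function names
# are illustrative.
import numpy as np
from shapely.geometry import LineString, Polygon

def build_tiles(length, width, n_length, n_width):
    """Return an n_length x n_width grid of tile polygons centred on the origin."""
    dl, dw = length / n_length, width / n_width
    return [[Polygon([(dl * i - length / 2,       dw * j - width / 2),
                      (dl * (i + 1) - length / 2, dw * j - width / 2),
                      (dl * (i + 1) - length / 2, dw * (j + 1) - width / 2),
                      (dl * i - length / 2,       dw * (j + 1) - width / 2)])
             for j in range(n_width)]
            for i in range(n_length)]

def mark_explored(tiles, explored, previous_position, current_position):
    """Flag every unexplored tile crossed by the segment between the two positions."""
    path = LineString([previous_position, current_position])
    for i, row in enumerate(tiles):
        for j, tile in enumerate(row):
            if explored[i, j] == 0 and path.intersects(tile):
                explored[i, j] = 1
    return explored

# Example: one pass down the runway at y = -1 m explores the whole j = 0 column.
# tiles = build_tiles(length=8.0, width=4.0, n_length=4, n_width=2)
# explored = np.zeros((4, 2))
# mark_explored(tiles, explored, (-4.0, -1.0), (4.0, -1.0))
# ---------------------------------------------------------------------------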
def __init__(self, testing, client_socket, messages_to_deep_guidance, stop_run_flag):
    print("Initializing deep guidance model runner")
    self.client_socket = client_socket
    self.messages_to_deep_guidance = messages_to_deep_guidance
    self.stop_run_flag = stop_run_flag
    self.testing = testing

    ###############################
    ### User-defined parameters ###
    ###############################
    self.offset_x = 0  # Docking offset in the body frame
    self.offset_y = 0  # Docking offset in the body frame
    self.offset_angle = 0

    # So we can access old Pi positions while we wait for new SPOTNet images to come in
    self.pi_position_queue = deque(maxlen=round(CAMERA_PROCESSING_TIME / PHASESPACE_TIMESTEP))
    # Loading the queue up with zeros
    for i in range(round(CAMERA_PROCESSING_TIME / PHASESPACE_TIMESTEP)):
        self.pi_position_queue.append((0., 0., 0.))

    # Holding the previous position so we know when SPOTNet gives a new update
    self.previousSPOTNet_relative_x = 0.0

    """
    Holding the chaser's x, y, and theta position when a SPOTNet image was taken,
    which we assume is at the same moment the previous results are received.
    This is to ensure the ~0.7 s SPOTNet processing time isn't poorly reflected
    in the target's estimated inertial position.
    """
    self.chaser_x_when_image_was_taken = 0
    self.chaser_y_when_image_was_taken = 0
    self.chaser_theta_when_image_was_taken = 0

    self.SPOTNet_target_x_inertial = 0
    self.SPOTNet_target_y_inertial = 0
    self.SPOTNet_target_angle_inertial = 0

    # Uncomment this on TF 2.0
    # tf.compat.v1.disable_eager_execution()

    # Clear any old graph
    tf.reset_default_graph()

    # Initialize Tensorflow, and load in the policy
    self.sess = tf.Session()

    # Building the policy network
    self.state_placeholder = tf.placeholder(dtype=tf.float32,
                                            shape=[None, Settings.OBSERVATION_SIZE],
                                            name="state_placeholder")
    self.actor = BuildActorNetwork(self.state_placeholder, scope='learner_actor_main')

    # Loading in trained network weights
    print("Attempting to load in previously-trained model\n")
    saver = tf.train.Saver()  # initialize the tensorflow Saver()

    # Try to load in policy network parameters
    try:
        ckpt = tf.train.get_checkpoint_state('../')
        saver.restore(self.sess, ckpt.model_checkpoint_path)
        print("\nModel successfully loaded!\n")
    except (ValueError, AttributeError):
        print("No model found... quitting :(")
        raise SystemExit

    print("Done initializing model!")
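# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the original class): the fixed-length deque
# above acts as a delay line. Appending the newest chaser pose every
# PHASESPACE_TIMESTEP means the element at the head of the deque is the pose
# from roughly CAMERA_PROCESSING_TIME ago, which is the pose to pair with a
# SPOTNet measurement that has just finished processing. The constants and the
# callback name below are illustrative assumptions.
from collections import deque

CAMERA_PROCESSING_TIME_EXAMPLE = 0.7  # [s] assumed SPOTNet processing latency
PHASESPACE_TIMESTEP_EXAMPLE = 0.1     # [s] assumed pose-update period

delay_steps = round(CAMERA_PROCESSING_TIME_EXAMPLE / PHASESPACE_TIMESTEP_EXAMPLE)
pose_history = deque(maxlen=delay_steps)
for _ in range(delay_steps):
    pose_history.append((0.0, 0.0, 0.0))  # (x, y, theta) placeholders

def on_new_pose(x, y, theta):
    """Called every PHASESPACE_TIMESTEP with the latest chaser pose."""
    delayed_pose = pose_history[0]    # pose from ~CAMERA_PROCESSING_TIME ago
    pose_history.append((x, y, theta))  # the oldest pose is dropped automatically
    return delayed_pose
# ---------------------------------------------------------------------------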