def default_remove_vessels(overlord, vessel_handlers):
    """
    Release every vessel that is not in the started state.

    Each handle in vessel_handlers is queried with explib.get_vessel_status
    using overlord.config['identity'].  A vessel counts as stopped when the
    lookup raises an Exception or when the reported status differs from
    explib.VESSEL_STATUS_STARTED.  Stopped vessels are handed to
    overlord.release_vessels and removed from the list with
    overlord.list_difference.

    Arguments:
      overlord: object providing logger, config, release_vessels and
        list_difference.
      vessel_handlers: iterable of vessel handles to check.

    Returns:
      The vessel handles still considered running.  When nothing was
      stopped, vessel_handlers itself is returned unchanged.
    """
    overlord.logger.debug('Checking for stopped vessels')

    # Collect the vessels that are stopped or whose status cannot be fetched.
    stopped_vessels = []
    for vessel in vessel_handlers:
        try:
            vessel_status = explib.get_vessel_status(
                vessel, overlord.config['identity'])
        except Exception:
            # Only ordinary errors mark a vessel as stopped.  A bare except
            # would also swallow KeyboardInterrupt/SystemExit and release a
            # vessel merely because the operator interrupted the script.
            stopped_vessels.append(vessel)
        else:
            if vessel_status != explib.VESSEL_STATUS_STARTED:
                stopped_vessels.append(vessel)

    if stopped_vessels:
        overlord.logger.info('Releasing ' + str(len(stopped_vessels)) +
                             ' stopped vessels')
        overlord.release_vessels(stopped_vessels)

        # Remove released vessels from the list.
        vessel_handlers = overlord.list_difference(vessel_handlers,
                                                   stopped_vessels)

    # Log the current number of running vessels.
    overlord.logger.info('Currently have ' + str(len(vessel_handlers)) +
                         ' running vessels')
    return vessel_handlers
def default_remove_vessels(overlord, vessel_handlers):
    """
    Drop vessels that are not running and return the ones that are.

    For every handle in vessel_handlers the status is fetched through
    explib.get_vessel_status with overlord.config["identity"].  A handle is
    treated as stopped if that call raises an Exception or if the status is
    not explib.VESSEL_STATUS_STARTED.  All stopped handles are passed to
    overlord.release_vessels and filtered out via overlord.list_difference.

    Arguments:
      overlord: object providing logger, config, release_vessels and
        list_difference.
      vessel_handlers: iterable of vessel handles to check.

    Returns:
      The handles that are still running; the original vessel_handlers
      object when no vessel had to be released.
    """
    overlord.logger.debug("Checking for stopped vessels")

    # Gather stopped vessels and vessels whose status lookup failed.
    stopped_vessels = []
    for vessel in vessel_handlers:
        try:
            vessel_status = explib.get_vessel_status(
                vessel, overlord.config["identity"])
        except Exception:
            # Deliberately not a bare except: KeyboardInterrupt and
            # SystemExit must propagate instead of causing a vessel release.
            stopped_vessels.append(vessel)
        else:
            if vessel_status != explib.VESSEL_STATUS_STARTED:
                stopped_vessels.append(vessel)

    if stopped_vessels:
        overlord.logger.info("Releasing " + str(len(stopped_vessels)) +
                             " stopped vessels")
        overlord.release_vessels(stopped_vessels)

        # Remove released vessels from the list.
        vessel_handlers = overlord.list_difference(vessel_handlers,
                                                   stopped_vessels)

    # Log the current number of running vessels.
    overlord.logger.info("Currently have " + str(len(vessel_handlers)) +
                         " running vessels")
    return vessel_handlers
def _check_vessel_status_change(vesselhandle, monitordict):
    """
    Poll one vessel and report a status change to the monitor's callback.

    The status returned by experimentlib.get_vessel_status is stored in the
    vessel's entry of monitordict['vessels'].  When it differs from the value
    stored before the poll (an empty string if none was recorded yet),
    monitordict['callback'] is called with (vesselhandle, old_status,
    new_status).  A vessel whose status is in
    experimentlib.VESSEL_STATUS_SET_INACTIVE is removed from
    monitordict['vesselhandle_list'] while holding _monitor_lock.

    Nothing happens when monitordict['canceled'] is true.  Exceptions derived
    from Exception are caught and their traceback is passed to _debug_print.
    """
    try:
        # Canceling a monitor does not abort parallelized calls that were
        # already scheduled, so each such call just returns right away.
        if monitordict['canceled']:
            return

        vessel_record = monitordict['vessels'][vesselhandle]
        vessel_record.setdefault('status', '')
        before = vessel_record.copy()

        status = experimentlib.get_vessel_status(
            vesselhandle, monitordict['identity'])
        vessel_record['status'] = status
        after = vessel_record.copy()

        # NOTE(review): the comment that used to sit here said a lock is
        # held while the user's callback runs; this function acquires no lock
        # before the callback -- confirm whether the caller holds one.
        status_changed = before['status'] != after['status']
        if status_changed:
            try:
                # TODO: make sure exceptions from the user's code end up
                # somewhere the user can see them.  For now the user is
                # expected to handle exceptions inside the callback, as
                # documented for register_vessel_status_monitor.
                notify = monitordict['callback']
                notify(vesselhandle, before['status'], after['status'])
            except Exception:
                _debug_print("Exception occurred in vessel status change callback:")
                _debug_print(traceback.format_exc())

        # Only inactive vessels are dropped from the monitor's list, to avoid
        # polling them over and over.  Scripts should therefore occasionally
        # re-add their known active vessels so that a temporary failure does
        # not cause a vessel to be ignored forever.
        if status not in experimentlib.VESSEL_STATUS_SET_INACTIVE:
            return

        _monitor_lock.acquire()
        try:
            # The entry in monitordict['vessels'] is intentionally left in
            # place: it only costs a little space and stays available to any
            # code that might still need the vessel's data.
            monitordict['vesselhandle_list'].remove(vesselhandle)
        finally:
            _monitor_lock.release()

    except Exception:
        _debug_print(traceback.format_exc())
def run(*args):
    """
    <Purpose>
      Starts the deployment and monitoring of a service on a number of vessels.
      Handles all acquisition of, uploading to, starting, and release of
      vessels. Contains the main loop of this program, and is thus the final
      function to call in all client programs. Requires init() to have been
      called prior to running.

    <Arguments>
      *args
        Passed through unchanged to run_on_vessels() after the program
        filename.

    <Exceptions>
      None are raised deliberately. Failures while polling a vessel's status
      or fetching a failed vessel's log are handled internally; other errors
      from the helpers called here propagate to the caller.

    <Side Effects>
      Opens config['logfilename'] for writing, stores the file object in
      config['logfile'] and persistently writes to it. Prints progress
      messages to stdout. Acquires, releases and renews vessels. Sleeps
      VESSEL_POLLING_TIME between loop iterations.

    <Returns>
      None, once KEEP_RUNNING is no longer true.
    """
    # Write logfile header
    logfile = open(config['logfilename'], 'w')
    config['logfile'] = logfile
    logfile.write('################################################\n')
    logfile.write('##   Overlord Deployment and Monitoring Log   ##\n')
    logfile.write('################################################\n\n')
    logfile.write('GENI user: ' + config['identity']['username'] + '\n')
    logfile.write('Vessels to monitor: ' + str(config['vesselcount']) + '\n')
    logfile.write('Time of script start: ' + str(time.time()) + '\n\n')
    logfile.flush()

    # Release any preallocated vessels
    vesselhandle_list = explib.seattlegeni_get_acquired_vessels(
        config['identity'])
    release_vessels(vesselhandle_list,
                    'Releasing ' + str(len(vesselhandle_list)) +
                    ' preallocated vessels...')

    # Acquire an initial sample of vessels; retry until at least one is
    # obtained.
    config['logfile'].write(str(time.time()) + ': Fetching initial batch of ' +
                            str(config['vesselcount']) + ' vessels:\n')
    config['logfile'].flush()
    vesselhandle_list = []
    while not vesselhandle_list:
        vesselhandle_list = acquire_vessels(config['vesselcount'])

    # Upload program to vessels
    vesselhandle_list = upload_to_vessels(vesselhandle_list,
                                          config['program_filename'])

    # Run program on vessels
    vesselhandle_list, failed_list = run_on_vessels(
        vesselhandle_list, config['program_filename'], *args)

    # Release any failed vessels
    if failed_list:
        _report_and_release_failed_vessels(failed_list)

    # Initialize counter variable for loop iterations
    loop_iterations = 0

    # NOTE(review): there is no `global PREPPED` declaration, so this binds a
    # function-local name that is never read. If a module-level PREPPED flag
    # is meant to be set for other code to observe, a global declaration is
    # required -- confirm the intent before changing it.
    PREPPED = True
    # Single-argument parenthesised prints produce the same output under the
    # Python 2 print statement and the Python 3 print function.
    print("PREPPED!")
    print("Vessel Handles: %s" % vesselhandle_list)

    # Main loop
    while KEEP_RUNNING:
        print("Starting Loop!")

        # Check for vessels not in started state
        stopped_vessel_list = []
        for vh in vesselhandle_list:
            try:
                vessel_status = explib.get_vessel_status(vh, config['identity'])
                log = explib.get_vessel_log(vh, config['identity'])
                print("Loop Log: %s" % log)
            except Exception:
                # Node lookup failed, so remove vessel from vesselhandle_list.
                # Only ordinary errors are handled: a bare except would also
                # swallow KeyboardInterrupt/SystemExit and release a healthy
                # vessel just because the script was interrupted.
                # TODO: proper way to handle failed advertisements?
                stopped_vessel_list.append(vh)
            else:
                if vessel_status != explib.VESSEL_STATUS_STARTED:
                    stopped_vessel_list.append(vh)

        # Release and replace any stopped vessels
        if stopped_vessel_list:
            release_vessels(stopped_vessel_list,
                            'Releasing ' + str(len(stopped_vessel_list)) +
                            ' stopped vessel(s)...')

            # Remove released vessels from vesselhandle_list
            vesselhandle_list = list_difference(vesselhandle_list,
                                                stopped_vessel_list)

        # Ensure that enough vessels are running
        if len(vesselhandle_list) < config['vesselcount']:
            # If there aren't enough active vessels, acquire some
            config['logfile'].write(str(time.time()) + ': Only ' +
                                    str(len(vesselhandle_list)) +
                                    ' vessel(s) out of target ' +
                                    str(config['vesselcount']) + ' detected\n')
            config['logfile'].flush()
            fresh_vessels = acquire_vessels(config['vesselcount'] -
                                            len(vesselhandle_list))

            # Upload and run program to/on fresh vessels
            fresh_vessels = upload_to_vessels(fresh_vessels,
                                              config['program_filename'])
            _success_list, failed_list = run_on_vessels(
                fresh_vessels, config['program_filename'], *args)

            # Release any failed vessels
            if failed_list:
                _report_and_release_failed_vessels(failed_list)

                # Remove released vessels from fresh_vessels list
                fresh_vessels = list_difference(fresh_vessels, failed_list)

            # Add fresh_vessels to vesselhandle_list
            vesselhandle_list.extend(fresh_vessels)

        # Sleep for parameterized amount of time
        time.sleep(VESSEL_POLLING_TIME)

        # Log a liveness message every certain number of iterations
        loop_iterations += 1
        if loop_iterations % LOG_AFTER_THIS_MANY_LOOPS == 0:
            config['logfile'].write(str(time.time()) + ': Still alive...\n')
            config['logfile'].flush()

        # Renew vessels according to constant period. Resetting the counter
        # also restarts the liveness-message cadence above.
        if loop_iterations * VESSEL_POLLING_TIME > VESSEL_RENEWAL_PERIOD:
            explib.seattlegeni_renew_vessels(config['identity'],
                                             vesselhandle_list)
            loop_iterations = 0


def _report_and_release_failed_vessels(failed_list):
    """Write each failed vessel's log to the logfile, then release them all."""
    config['logfile'].write(str(time.time()) + ': Running ' +
                            config['program_filename'] + ' failed on ' +
                            str(len(failed_list)) + ' vessels\n')

    # Get details about failed vessel(s) and log them
    for vh in failed_list:
        try:
            vessel_log = explib.get_vessel_log(vh, config['identity'])
        except Exception:
            vessel_log = '[ERROR: vessel log fetch failed]'

        nodeid, _vesselname = explib.get_nodeid_and_vesselname(vh)
        nodelocation = explib.get_node_location(nodeid)

        # Log the vessel's log contents
        config['logfile'].write('Log contents of failed vessel at ' +
                                nodelocation + ': ' + vessel_log + '\n')
        config['logfile'].flush()

    # Release the failed vessels
    release_vessels(failed_list, 'Releasing failed vessel(s)...')
def _check_vessel_status_change(vesselhandle, monitordict):
    """
    Fetch the current status of a single vessel and, if it differs from the
    status recorded for that vessel, invoke the monitor's callback.

    The vessel's entry in monitordict['vessels'] is updated with the status
    returned by experimentlib.get_vessel_status.  On a change,
    monitordict['callback'] receives (vesselhandle, old_status, new_status);
    the old status is an empty string when none had been recorded.  If the
    new status belongs to experimentlib.VESSEL_STATUS_SET_INACTIVE, the
    handle is removed from monitordict['vesselhandle_list'] under
    _monitor_lock.

    Returns immediately when monitordict['canceled'] is true.  Exceptions
    derived from Exception are caught and reported through _debug_print.
    """
    try:
        # Removing/canceling a monitor leaves already-dispatched parallel
        # calls running; they notice the flag here and do nothing.
        if monitordict['canceled']:
            return

        entry = monitordict['vessels'][vesselhandle]
        entry.setdefault('status', '')
        snapshot_old = entry.copy()

        identity = monitordict['identity']
        status = experimentlib.get_vessel_status(vesselhandle, identity)
        entry['status'] = status

        # Compare snapshots taken before and after the update so that the
        # callback only fires on an actual change.
        snapshot_new = entry.copy()
        old_status = snapshot_old['status']
        new_status = snapshot_new['status']

        if old_status != new_status:
            try:
                # TODO: make sure exceptions raised by the user's code end up
                # somewhere the user has access to them.  Until then, users
                # are expected to handle exceptions inside their callback, as
                # documented for register_vessel_status_monitor.
                monitordict['callback'](vesselhandle, old_status, new_status)
            except Exception:
                _debug_print(
                    "Exception occurred in vessel status change callback:")
                _debug_print(traceback.format_exc())

        # Inactive vessels are taken off the monitor's list so that failures
        # are not repeated on every poll.  Scripts should occasionally re-add
        # their known active vessels so that a temporary failure does not
        # leave a vessel ignored forever.
        is_inactive = status in experimentlib.VESSEL_STATUS_SET_INACTIVE
        if is_inactive:
            _monitor_lock.acquire()
            try:
                # monitordict['vessels'][vesselhandle] is deliberately kept:
                # leaving it costs only a little space and keeps the vessel's
                # data available should other code still need it.
                monitordict['vesselhandle_list'].remove(vesselhandle)
            finally:
                _monitor_lock.release()

    except Exception:
        _debug_print(traceback.format_exc())