def main():
  # Initialize the main database.
  maindb.init_maindb()

  # Initialize the key database.
  keydb.init_keydb()

  # Initialize the nodemanager.
  nodemanager.init_nodemanager()

  # Start the background thread that does vessel cleanup.
  thread.start_new_thread(cleanup_vessels, ())

  # Start the background thread that does vessel user key synchronization.
  thread.start_new_thread(sync_user_keys_of_vessels, ())

  # Register the XMLRPCServer. Use allow_none to allow the python None value.
  server = ThreadedXMLRPCServer(("127.0.0.1", LISTENPORT), allow_none=True)

  log.info("Backend listening on port " + str(LISTENPORT) + ".")

  server.register_instance(BackendPublicFunctions())

  while True:
    server.handle_request()
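# --- Illustrative sketch (an assumption, not code from this project) ---
# ThreadedXMLRPCServer is defined elsewhere in the codebase. The sketch below
# only shows the common Python 2 pattern such a class is typically built from
# (SocketServer.ThreadingMixIn mixed into SimpleXMLRPCServer), for readers
# unfamiliar with the name; the real class may differ.
import SocketServer
from SimpleXMLRPCServer import SimpleXMLRPCServer

class ExampleThreadedXMLRPCServer(SocketServer.ThreadingMixIn, SimpleXMLRPCServer):
  # Each incoming XML-RPC request is handled in its own thread, so one slow
  # or blocking request does not stall the whole server.
  pass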
def _release_individual_lock(session_id, locktype, lockname):
  """
  <Purpose>
    This is called by do_release_locks for each lock to be released. This
    will mark the lock as not being held by the specified session and will
    take care of giving released locks to queued requests.
  <Arguments>
    session_id:
      The string that is the session id under which the locks should be
      released.
    locktype:
      The type of the lock, either 'user' or 'node'.
    lockname:
      The name of the lock (a string).
  <Exceptions>
    None.
  <Side Effects>
    Modifies the global heldlockdict and the global sessiondict to indicate
    that the specified lock is no longer held by the session, as well as to
    grant the lock to the next session in the lock's queue which is waiting
    for it, if any. If there was a session queued for this lock, after giving
    the lock to that session this function will check if the new lock holder
    is waiting on any more locks. If not, the server thread for the queued
    request will be unblocked.
  <Returns>
    None.
  """
  heldlockinfo = heldlockdict[locktype][lockname]

  # Regardless of whether there are queued sessions waiting for this lock,
  # it is removed from the list of locks this session holds.
  sessiondict[session_id]["heldlocks"][locktype].remove(lockname)

  # Remove this lock from the locktimelist.
  for locktimeitem in locktimelist:
    if locktimeitem[0] == {locktype: lockname}:
      log.info("Lock " + str({locktype: lockname}) + " was held for " +
               str(datetime.datetime.now() - locktimeitem[1]))
      locktimelist.remove(locktimeitem)
      break

  if len(heldlockinfo["queue"]) > 0:
    # Set the lock as held by the next queued session_id.
    new_lock_holder = heldlockinfo["queue"].pop(0)
    heldlockinfo["locked_by_session"] = new_lock_holder

    # Update the sessiondict to change this lock from a needed lock to a held lock.
    sessiondict[new_lock_holder]["heldlocks"][locktype].append(lockname)
    sessiondict[new_lock_holder]["neededlocks"][locktype].remove(lockname)

    # Append to the locktimelist indicating when this lock was acquired.
    locktimelist.append(({locktype: lockname}, datetime.datetime.now()))

    # If the session now holding the lock isn't waiting on any more locks,
    # unblock the session's current AcquireLocks request thread.
    if _is_lockdict_empty(sessiondict[new_lock_holder]["neededlocks"]):
      sessiondict[new_lock_holder]["acquirelocksproceedevent"].set()

  else:
    # There are no sessions waiting on this lock, so the lock is now held by nobody.
    heldlockinfo["locked_by_session"] = None
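# --- Illustrative sketch (an assumption, not the lockserver's actual code) ---
# Plausible shapes of the module-level structures that _release_individual_lock
# manipulates, inferred from how they are accessed above. The session ids and
# lock names are made up.
import datetime
import threading

# Which session (if any) holds each lock, plus the queue of sessions waiting for it.
example_heldlockdict = {
    'user': {'alice': {'locked_by_session': '1001', 'queue': ['1002']}},
    'node': {'node55': {'locked_by_session': None, 'queue': []}},
}

# Per-session bookkeeping: locks held, locks still needed, and the event used
# to unblock a pending AcquireLocks request thread.
example_sessiondict = {
    '1001': {'heldlocks': {'user': ['alice'], 'node': []},
             'neededlocks': {'user': [], 'node': []},
             'acquirelocksinprogress': False,
             'acquirelocksproceedevent': threading.Event()},
}

# When each currently held lock was acquired, oldest first.
example_locktimelist = [({'user': 'alice'}, datetime.datetime.now())]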
def _cleanup_single_vessel(vessel):
  """
  This function is passed by cleanup_vessels() as the function argument to
  run_parallelized().
  """
  # This does seem wasteful of lockserver communication to require four
  # round-trips with the lockserver (get handle, lock, unlock, release handle),
  # but if we really want to fix that then I think the best thing to do would
  # be to allow obtaining a lockhandle and releasing a lockhandle to be done
  # in the same calls as lock acquisition and release.

  node_id = maindb.get_node_identifier_from_vessel(vessel)
  lockserver_handle = lockserver.create_lockserver_handle()

  # Lock the node that the vessel is on.
  lockserver.lock_node(lockserver_handle, node_id)

  try:
    # Get a new vessel object from the db in case it was modified in the db
    # before the lock was obtained.
    vessel = maindb.get_vessel(node_id, vessel.name)

    # Now that we have a lock on the node that this vessel is on, find out
    # if we should still clean up this vessel (e.g. maybe a node state
    # transition script moved the node to a new state and this vessel was
    # removed).
    needscleanup, reasonwhynot = maindb.does_vessel_need_cleanup(vessel)
    if not needscleanup:
      log.info("[_cleanup_single_vessel] Vessel " + str(vessel) +
               " no longer needs cleanup: " + reasonwhynot)
      return

    nodeid = maindb.get_node_identifier_from_vessel(vessel)
    nodehandle = _get_node_handle_from_nodeid(nodeid)

    try:
      log.info("[_cleanup_single_vessel] About to ChangeUsers on vessel " + str(vessel))
      nodemanager.change_users(nodehandle, vessel.name, [''])
      log.info("[_cleanup_single_vessel] About to ResetVessel on vessel " + str(vessel))
      nodemanager.reset_vessel(nodehandle, vessel.name)
    except NodemanagerCommunicationError:
      # We don't pass this exception up. Maybe the node is offline now. At some
      # point, it will be marked in the database as offline (should we be doing
      # that here?). At that time, the dirty vessels on that node will not be
      # in the cleanup list anymore.
      log.info("[_cleanup_single_vessel] Failed to cleanup vessel " + str(vessel) +
               ". " + traceback.format_exc())
      return

    # We only mark it as clean if no exception was raised when trying to
    # perform the above nodemanager operations.
    maindb.mark_vessel_as_clean(vessel)

    log.info("[_cleanup_single_vessel] Successfully cleaned up vessel " + str(vessel))

  finally:
    # Unlock the node.
    lockserver.unlock_node(lockserver_handle, node_id)
    lockserver.destroy_lockserver_handle(lockserver_handle)
def acquire_specific_vessels_best_effort(lockserver_handle, geniuser, vessel_list):
  """
  <Purpose>
    Acquire for geniuser as many vessels in vessel_list as possible.
  <Arguments>
    lockserver_handle
      The lockserver handle to be used for obtaining node locks.
    geniuser
      The GeniUser the vessels should be acquired for.
    vessel_list
      The vessels to attempt to acquire for geniuser.
  <Exceptions>
    None
  <Side Effects>
    Zero or more of the vessels are acquired for the user. The database has
    been updated to reflect the acquisition.
  <Returns>
    A list of the vessels that were acquired.
  """
  acquired_vessels = []

  parallel_results = _parallel_process_vessels_from_list(vessel_list, _do_acquire_vessel,
                                                         lockserver_handle, geniuser)

  # The "exception" key contains a list of tuples where the first item of
  # the tuple is the vessel object and the second item is the str(e) of
  # the exception. Because the repy parallelization module that is used
  # underneath only passes up the exception string, we have made
  # _do_acquire_vessel() include the string "UnableToAcquireResourcesError"
  # in the exception message so we can tell these apart from more
  # serious failures (e.g. the backend is down).
  for (vessel, exception_message) in parallel_results["exception"]:

    if "UnableToAcquireResourcesError" in exception_message:
      # This is ok, maybe the node is offline.
      log.info("Failed to acquire vessel: " + str(vessel))

    else:
      # Something serious happened, maybe the backend is down.
      raise InternalError("Unexpected exception occurred during parallelized " +
                          "acquisition of vessels: " + exception_message)

  # The "returned" key contains a list of tuples where the first item of
  # the tuple is the vessel object and the second is the return value
  # (the vessel object returned by _do_acquire_vessel()).
  for (ignored_argument_vessel, returned_vessel) in parallel_results["returned"]:
    # We successfully acquired this vessel.
    # Append the returned vessel from _do_acquire_vessel() rather than
    # the argument that the parallelize.repy library used. Somewhere
    # along the way a copy of the argument_vessel is being made so it
    # doesn't reflect changes made to it.
    acquired_vessels.append(returned_vessel)

  return acquired_vessels
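# --- Illustrative sketch (an assumption based on the comments above) ---
# Rough shape of the dictionary returned by _parallel_process_vessels_from_list();
# only the two keys used in this module are shown, with made-up placeholder values.
example_parallel_results = {
    # (vessel_object, str(e)) for each vessel whose call raised an exception.
    "exception": [("<vessel node1:v2>", "UnableToAcquireResourcesError: node is offline")],
    # (argument_vessel_object, value returned by _do_acquire_vessel) for each success.
    "returned": [("<vessel node3:v1>", "<vessel node3:v1 (updated)>")],
}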
def _report_node_problem(node, message):
  global nodes_with_problems

  if node.node_identifier not in nodes_with_problems:
    nodes_with_problems[node.node_identifier] = []

  nodes_with_problems[node.node_identifier].append(message)

  log.info("Problem on node " + str(node) + ": " + message)
def _record_node_communication_failure(readonly, node):
  if node not in actionstaken["recorded_communication_failure"]:
    actionstaken["recorded_communication_failure"].append(node)

  if readonly:
    log.info(str(node) + " Not recording communication failure because called in readonly mode.")
  else:
    log.info(str(node) + " Recording communication failure.")
    maindb.record_node_communication_failure(node)
def _mark_node_inactive(readonly, node):
  if node not in actionstaken["node_marked_inactive"]:
    actionstaken["node_marked_inactive"].append(node)

  if readonly:
    log.info(str(node) + " Not marking node as inactive because called in readonly mode.")
  else:
    log.info(str(node) + " Marking node as inactive.")
    maindb.mark_node_as_inactive(node)
def _release_vessel(readonly, vessel):
  if vessel not in actionstaken["vessel_released"]:
    actionstaken["vessel_released"].append(vessel)

  if readonly:
    log.info(str(vessel) + " Not recording vessel as released because called in readonly mode.")
  else:
    log.info(str(vessel) + " Recording vessel as released.")
    maindb.record_released_vessel(vessel)
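# --- Illustrative sketch (an assumption, not taken from nodestatus.py) ---
# A guess at the module-level bookkeeping that _report_node_problem and the
# _record_*/_mark_*/_release_vessel helpers above rely on, inferred only from
# how they index into these structures.
nodes_with_problems = {}   # node_identifier -> list of problem message strings

actionstaken = {
    "recorded_communication_failure": [],
    "node_marked_inactive": [],
    "node_marked_broken": [],
    "vessel_released": [],
}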
def sync_user_keys_of_vessels():
  """
  This function is started as a separate thread. It continually checks whether
  there are vessels needing their user keys sync'd and initiates the user key
  sync as needed.
  """
  log.info("[sync_user_keys_of_vessels] thread started.")

  # Run forever.
  while True:

    try:
      # Sleep a few seconds for those times where we don't have any vessels to clean up.
      time.sleep(5)

      # We shouldn't be running the backend in production with
      # settings.DEBUG = True. Just in case, though, tell django to reset its
      # list of saved queries each time through the loop.
      if settings.DEBUG:
        django.db.reset_queries()

      # Get a list of vessels that need to have user keys sync'd. This doesn't
      # include nodes known to be inactive as we would just continue failing to
      # communicate with nodes that are down.
      vessellist = maindb.get_vessels_needing_user_key_sync()
      if len(vessellist) == 0:
        continue

      log.info("[sync_user_keys_of_vessels] " + str(len(vessellist)) +
               " vessels to have user keys sync'd: " + str(vessellist))

      parallel_results = parallel.run_parallelized(vessellist, _sync_user_keys_of_single_vessel)

      if len(parallel_results["exception"]) > 0:
        for vessel, exception_message in parallel_results["exception"]:
          log_message = "Unhandled exception during parallelized vessel user key sync: " + exception_message
          log.critical(log_message)
        # Raise the last exception so that the admin gets an email.
        raise InternalError(log_message)

    except:
      message = "[sync_user_keys_of_vessels] Something very bad happened: " + traceback.format_exc()
      log.critical(message)

      # Send an email to the addresses listed in settings.ADMINS.
      if not settings.DEBUG:
        subject = "Critical SeattleGeni backend error"
        django.core.mail.mail_admins(subject, message)

        # Sleep for ten minutes to make sure we don't flood the admins with
        # error report emails.
        time.sleep(600)
def monitor_held_lock_times():
  """
  Periodically checks whether there are locks that have been held too long.
  When there is a lock that has been held too long, logs it and also sends
  an email if settings.DEBUG is False.
  This function gets started in its own thread.
  """
  log.info("[monitor_held_lock_times] thread started.")

  # Run forever.
  while True:

    try:
      # Wait a bit between checks.
      time.sleep(SECONDS_BETWEEN_LOCK_HOLDING_TIME_CHECKS)

      # Grab the datalock and get the oldest held lock, if there are any.
      datalock.acquire()
      try:
        if len(locktimelist) == 0:
          # No locks are held.
          continue
        oldestlock = locktimelist[0]
      finally:
        datalock.release()

      held_timedelta = datetime.datetime.now() - oldestlock[1]

      # Check if the oldest lock has been held too long.
      if held_timedelta > MAX_EXPECTED_LOCK_HOLDING_TIMEDELTA:
        message = "Lockserver lock " + str(oldestlock[0])
        message += " has been held since " + str(oldestlock[1])
        message += " (timedelta: " + str(held_timedelta) + ")"
        # Raise an exception which will cause an email to be sent from the
        # except clause below.
        raise InternalError(message)

    # Catch all exceptions so that the monitor thread will never die.
    except:
      message = "[monitor_held_lock_times] Something very bad happened: " + traceback.format_exc()
      log.critical(message)

      # Send an email to the addresses listed in settings.ADMINS.
      if not settings.DEBUG:
        subject = "Critical SeattleGeni lockserver error"
        django.core.mail.mail_admins(subject, message)

        # Sleep for 30 minutes to make sure we don't flood the admins with
        # error report emails.
        time.sleep(60 * 30)
def _do_renew_vessel(vessel, geniuser):
  if vessel.acquired_by_user != geniuser:
    # The vessel was either already released, someone is trying to do things
    # they shouldn't, or we have a bug.
    log.info("Not renewing vessel " + str(vessel) +
             " because it is not acquired by user " + str(geniuser))
    return

  maindb.set_maximum_vessel_expiration(vessel)
def AcquireLocks(*args):
  """
  This is a public function of the XMLRPC server. See the module comments at
  the top of the file for a description of how it is used.
  """
  _assert_number_of_arguments('AcquireLocks', args, 2)
  (session_id, request_acquire_lockdict) = args

  datalock.acquire()
  try:
    # Ensure it's a string before printing it like one.
    _assert_valid_session(session_id)

    log.info("[session_id: " + session_id + "] AcquireLocks called for locks " +
             str(request_acquire_lockdict))

    # Check if this session has an outstanding AcquireLocks request. Clients
    # should not be making concurrent AcquireLocks requests.
    if sessiondict[session_id]["acquirelocksinprogress"]:
      message = "[session_id: " + session_id + "] AcquireLocks called while an earlier AcquireLocks call has not been completed."
      raise LockserverInvalidRequestError(message)

    do_acquire_locks(session_id, request_acquire_lockdict)

    # Indicate that there is a running AcquireLocks request for this session
    # so that future AcquireLocks requests will be denied until this one is
    # fulfilled. If the call to do_acquire_locks raised an exception, this
    # will not get set.
    sessiondict[session_id]["acquirelocksinprogress"] = True

  finally:
    datalock.release()

  # Wait for our event flag to signal that we have acquired the locks.
  # This is what causes the request thread to block until it is fulfilled.
  # If the event is not set at this point (causing this thread to block),
  # then it will be set by calls to ReleaseLocks. If a call to ReleaseLocks
  # signals this event between the time we released the global datalock and
  # when we get to this wait() line, that's fine.
  sessiondict[session_id]["acquirelocksproceedevent"].wait()

  # Indicate that we've made it past our wait() call, meaning that future
  # calls to do_acquire_locks() can be allowed again. We do not need to hold
  # the global datalock to set this as it can only be set to True in the
  # critical section above which first ensures that the value is not already
  # true.
  sessiondict[session_id]["acquirelocksinprogress"] = False

  log.info("[session_id: " + session_id + "] AcquireLocks fulfilled request for locks " +
           str(request_acquire_lockdict))
def GetStatus(*args):
  """
  This is a public function of the XMLRPC server. See the module comments at
  the top of the file for a description of how it is used.
  """
  _assert_number_of_arguments('GetStatus', args, 0)

  datalock.acquire()
  try:
    log.info("GetStatus called.")
    return do_get_status()
  finally:
    datalock.release()
def StartSession(*args):
  """
  This is a public function of the XMLRPC server. See the module comments at
  the top of the file for a description of how it is used.
  """
  _assert_number_of_arguments('StartSession', args, 0)

  datalock.acquire()
  try:
    session_id = do_start_session()
    log.info("[session_id: " + session_id + "] StartSession called.")
    return session_id
  finally:
    datalock.release()
def ReleaseLocks(*args):
  """
  This is a public function of the XMLRPC server. See the module comments at
  the top of the file for a description of how it is used.
  """
  _assert_number_of_arguments('ReleaseLocks', args, 2)
  (session_id, request_release_lockdict) = args

  datalock.acquire()
  try:
    # Ensure it's a string before printing it like one.
    _assert_valid_session(session_id)

    log.info("[session_id: " + session_id + "] ReleaseLocks called for locks " +
             str(request_release_lockdict))

    do_release_locks(session_id, request_release_lockdict)
  finally:
    datalock.release()
def EndSession(*args):
  """
  This is a public function of the XMLRPC server. See the module comments at
  the top of the file for a description of how it is used.
  """
  _assert_number_of_arguments('EndSession', args, 1)
  # Avoid the python magic comma that would be needed to write this as
  # "(session_id,) = args".
  session_id = args[0]

  datalock.acquire()
  try:
    # Ensure it's a string before printing it like one.
    _assert_valid_session(session_id)

    log.info("[session_id: " + session_id + "] EndSession called.")

    do_end_session(session_id)
  finally:
    datalock.release()
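# --- Illustrative sketch (an assumption, not part of the lockserver) ---
# Hypothetical client-side walkthrough of the session lifecycle exposed by the
# public functions above: StartSession -> AcquireLocks -> ReleaseLocks ->
# EndSession. The URL/port and the exact lockdict layout ({'user': [...],
# 'node': [...]}) are inferred from the code, not a documented API.
import xmlrpclib

lockserver_proxy = xmlrpclib.ServerProxy("http://127.0.0.1:8010", allow_none=True)  # port is a placeholder

session_id = lockserver_proxy.StartSession()
wanted_locks = {'user': ['alice'], 'node': ['node55']}

# Blocks until every requested lock has been granted to this session.
lockserver_proxy.AcquireLocks(session_id, wanted_locks)
try:
  pass  # ... do the work that required the locks ...
finally:
  lockserver_proxy.ReleaseLocks(session_id, wanted_locks)
  lockserver_proxy.EndSession(session_id)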
def main():
  # Initialize global variables.
  init_globals()

  # Register the XMLRPCServer. Use allow_none to allow the python None value.
  server = ThreadedXMLRPCServer(("127.0.0.1", LISTENPORT), allow_none=True)

  log.info("Listening on port " + str(LISTENPORT) + ".")

  # Start the background thread that watches for locks being held too long.
  thread.start_new_thread(monitor_held_lock_times, ())

  server.register_instance(LockserverPublicFunctions())

  while True:
    server.handle_request()

    # Shut down the lockserver if there was an internal error.
    # This doesn't actually get detected until another request has been
    # made, as the main server thread is often already blocked in the
    # next handle_request() call when this value gets set.
    if lockserver_had_error:
      sys.exit(1)
def _do_release_vessel(vessel, geniuser):
  """
  Obtains a lock on the node the vessel is on and then makes a call to the
  backend to release the vessel.
  """
  if vessel.acquired_by_user != geniuser:
    # The vessel was either already released, someone is trying to do things
    # they shouldn't, or we have a bug.
    log.info("Not releasing vessel " + str(vessel) +
             " because it is not acquired by user " + str(geniuser))
    return

  # We don't check for node.is_active == True because we might as well have
  # the backend try to clean up the vessel even if the database says it's
  # inactive (maybe the node is back online?).

  # This will not raise an exception, even if the node the vessel is on is down.
  backend.release_vessel(vessel)

  # Update the database to reflect the release of the vessel.
  maindb.record_released_vessel(vessel)
def _mark_node_broken(readonly, node):
  if node not in actionstaken["node_marked_broken"]:
    actionstaken["node_marked_broken"].append(node)

  if node.is_broken:
    log.info(str(node) + " Not marking node as broken because it is already broken.")
  else:
    if readonly:
      log.info(str(node) + " Not marking node as broken because called in readonly mode.")
    else:
      log.info(str(node) + " Marking node as broken.")
      maindb.mark_node_as_broken(node)
def cleanup_vessels():
  """
  This function is started as a separate thread. It continually checks whether
  there are vessels needing to be cleaned up and initiates cleanup as needed.
  """
  log.info("[cleanup_vessels] cleanup thread started.")

  # Start transaction management.
  django.db.transaction.enter_transaction_management()

  # Run forever.
  while True:

    try:
      # Sleep a few seconds for those times where we don't have any vessels to clean up.
      time.sleep(5)

      # We shouldn't be running the backend in production with
      # settings.DEBUG = True. Just in case, though, tell django to reset its
      # list of saved queries each time through the loop. Note that this is not
      # specific to the cleanup thread as other parts of the backend are using
      # the maindb, as well, so we're overloading the purpose of the cleanup
      # thread by doing this here. This is just a convenient place to do it.
      # See http://docs.djangoproject.com/en/dev/faq/models/#why-is-django-leaking-memory
      # for more info.
      if settings.DEBUG:
        django.db.reset_queries()

      # First, make it so that expired vessels are seen as dirty. We aren't
      # holding a lock on the nodes when we do this. It's possible that we do
      # this while someone else has a lock on the node. What would result?
      # I believe the worst result is that a user has their vessel marked as
      # dirty after they renewed in the case where they are renewing it just
      # as it expires (with some exceptionally bad timing involved). And,
      # that's not really very bad as if the user is trying to renew at the
      # exact moment it expires, they're trying their luck with how fast their
      # request gets processed, anyways. In short, I don't think it's important
      # enough to either obtain locks to do this or to rewrite the code to
      # avoid any need for separately marking expired vessels as dirty rather
      # than just trying to process expired vessels directly in the code below.
      date_started = datetime.datetime.now()
      expired_list = maindb.mark_expired_vessels_as_dirty()
      if len(expired_list) > 0:
        log.info("[cleanup_vessels] " + str(len(expired_list)) +
                 " expired vessels have been marked as dirty: " + str(expired_list))
        maindb.create_action_log_event("mark_expired_vessels_as_dirty", user=None,
                                       second_arg=None, third_arg=None,
                                       was_successful=True, message=None,
                                       date_started=date_started,
                                       vessel_list=expired_list)

      # Get a list of vessels to clean up. This doesn't include nodes known to
      # be inactive as we would just continue failing to communicate with nodes
      # that are down.
      cleanupvessellist = maindb.get_vessels_needing_cleanup()
      if len(cleanupvessellist) == 0:
        continue

      log.info("[cleanup_vessels] " + str(len(cleanupvessellist)) +
               " vessels to clean up: " + str(cleanupvessellist))

      parallel_results = parallel.run_parallelized(cleanupvessellist, _cleanup_single_vessel)

      if len(parallel_results["exception"]) > 0:
        for vessel, exception_message in parallel_results["exception"]:
          log_message = "Unhandled exception during parallelized vessel cleanup: " + exception_message
          log.critical(log_message)
        # Raise the last exception so that the admin gets an email.
        raise InternalError(log_message)

    except:
      message = "[cleanup_vessels] Something very bad happened: " + traceback.format_exc()
      log.critical(message)

      # Send an email to the addresses listed in settings.ADMINS.
      if not settings.DEBUG:
        subject = "Critical SeattleGeni backend error"
        django.core.mail.mail_admins(subject, message)

        # Sleep for ten minutes to make sure we don't flood the admins with
        # error report emails.
        time.sleep(600)

    finally:
      # Manually commit the transaction to prevent caching.
      django.db.transaction.commit()
def _acquire_vessels_from_list(lockserver_handle, geniuser, vesselcount, vessel_list):
  """
  This function will try to acquire vesselcount vessels from vessel_list. If
  fewer than vesselcount can be acquired, then the partial set of vessels that
  were acquired will be released by this function before it returns.
  Returns the list of acquired vessels if successful.
  """
  # Make sure there are sufficient vessels to even try to fulfill the request.
  if len(vessel_list) < vesselcount:
    raise UnableToAcquireResourcesError("There are not enough available vessels to fulfill the request.")

  acquired_vessels = []
  remaining_vessel_list = vessel_list[:]

  # Keep trying to acquire vessels until there are no more left to acquire.
  # There's a "return" statement in the loop that will get out of the loop
  # once we've obtained all of the vessels we wanted, so here we are only
  # concerned with there being any vessels left to try.
  while len(remaining_vessel_list) > 0:

    # Each time through the loop we'll try to acquire the number of vessels
    # remaining that are needed to fulfill the user's request.
    remaining_needed_vesselcount = vesselcount - len(acquired_vessels)
    next_vessels_to_acquire = remaining_vessel_list[:remaining_needed_vesselcount]
    remaining_vessel_list = remaining_vessel_list[remaining_needed_vesselcount:]

    # Note that we haven't worried about checking if the number of remaining
    # vessels could still fulfill the user's request. In the name of
    # correctness over efficiency, we'll let this case, which should be rare
    # (at least until the vessel counts that users request get to be huge),
    # sort itself out with a few unnecessary vessel acquisitions before they
    # ultimately get released after this loop.

    parallel_results = _parallel_process_vessels_from_list(next_vessels_to_acquire,
                                                           _do_acquire_vessel,
                                                           lockserver_handle, geniuser)

    # The "exception" key contains a list of tuples where the first item of
    # the tuple is the vessel object and the second item is the str(e) of
    # the exception. Because the repy parallelization module that is used
    # underneath only passes up the exception string, we have made
    # _do_acquire_vessel() include the string "UnableToAcquireResourcesError"
    # in the exception message so we can tell these apart from more
    # serious failures (e.g. the backend is down).
    for (vessel, exception_message) in parallel_results["exception"]:

      if "UnableToAcquireResourcesError" in exception_message:
        # This is ok, maybe the node is offline.
        log.info("Failed to acquire vessel: " + str(vessel))

      else:
        # Something serious happened, maybe the backend is down.
        raise InternalError("Unexpected exception occurred during parallelized " +
                            "acquisition of vessels: " + exception_message)

    # The "returned" key contains a list of tuples where the first item of
    # the tuple is the vessel object and the second is the return value
    # (the vessel object returned by _do_acquire_vessel()).
    for (ignored_argument_vessel, returned_vessel) in parallel_results["returned"]:
      # We successfully acquired this vessel.
      log.info("Successfully acquired vessel: " + str(returned_vessel))
      # Append the returned vessel from _do_acquire_vessel() rather than
      # the argument that the parallelize.repy library used. Somewhere
      # along the way a copy of the argument_vessel is being made so it
      # doesn't reflect changes made to it.
      acquired_vessels.append(returned_vessel)

    # If we've acquired all of the vessels the user wanted, we're done.
    if len(acquired_vessels) == vesselcount:
      return acquired_vessels

  # If we got here, then we didn't acquire the vessels the user wanted. We
  # release any vessels that may have been acquired rather than leave the user
  # with a partial set of what they requested.
  if acquired_vessels:
    release_vessels(lockserver_handle, geniuser, acquired_vessels)

  raise UnableToAcquireResourcesError("Failed to acquire enough vessels to fulfill the request.")
if __name__ == '__main__':
  try:
    main()
  except KeyboardInterrupt:
    log.info("Exiting on KeyboardInterrupt.")
    sys.exit(0)
def _sync_user_keys_of_single_vessel(vessel):
  """
  This function is passed by sync_user_keys_of_vessels() as the function
  argument to run_parallelized().
  """
  # This does seem wasteful of lockserver communication to require four
  # round-trips with the lockserver (get handle, lock, unlock, release handle),
  # but if we really want to fix that then I think the best thing to do would
  # be to allow obtaining a lockhandle and releasing a lockhandle to be done
  # in the same calls as lock acquisition and release.

  node_id = maindb.get_node_identifier_from_vessel(vessel)
  lockserver_handle = lockserver.create_lockserver_handle()

  # Lock the node that the vessel is on.
  lockserver.lock_node(lockserver_handle, node_id)

  try:
    # Get a new vessel object from the db in case it was modified in the db
    # before the lock was obtained.
    vessel = maindb.get_vessel(node_id, vessel.name)

    # Now that we have a lock on the node that this vessel is on, find out
    # if we should still sync user keys on this vessel (e.g. maybe a node state
    # transition script moved the node to a new state and this vessel was
    # removed).
    needssync, reasonwhynot = maindb.does_vessel_need_user_key_sync(vessel)
    if not needssync:
      log.info("[_sync_user_keys_of_single_vessel] Vessel " + str(vessel) +
               " no longer needs user key sync: " + reasonwhynot)
      return

    nodeid = maindb.get_node_identifier_from_vessel(vessel)
    nodehandle = _get_node_handle_from_nodeid(nodeid)

    # The list returned from get_users_with_access_to_vessel includes the key of
    # the user who has acquired the vessel along with any other users they have
    # given access to.
    user_list = maindb.get_users_with_access_to_vessel(vessel)

    key_list = []
    for user in user_list:
      key_list.append(user.user_pubkey)

    if len(key_list) == 0:
      raise InternalError("Empty user key list for vessel " + str(vessel))

    try:
      log.info("[_sync_user_keys_of_single_vessel] About to ChangeUsers on vessel " + str(vessel))
      nodemanager.change_users(nodehandle, vessel.name, key_list)
    except NodemanagerCommunicationError:
      # We don't pass this exception up. Maybe the node is offline now. At some
      # point, it will be marked in the database as offline and won't show up in
      # our list of vessels to sync user keys of anymore.
      log.info("[_sync_user_keys_of_single_vessel] Failed to sync user keys of vessel " +
               str(vessel) + ". " + traceback.format_exc())
      return

    # We only mark it as sync'd if no exception was raised when trying to perform
    # the above nodemanager operations.
    maindb.mark_vessel_as_not_needing_user_key_sync(vessel)

    log.info("[_sync_user_keys_of_single_vessel] Successfully sync'd user keys of vessel " + str(vessel))

  finally:
    # Unlock the node.
    lockserver.unlock_node(lockserver_handle, node_id)
    lockserver.destroy_lockserver_handle(lockserver_handle)
def main():
  """
  This will run an infinite loop of checks over all of the active nodes in
  the database.
  """
  lockserver_handle = lockserver.create_lockserver_handle()

  # Always try to release the lockserver handle, though it's probably not
  # very useful in this case.
  try:

    while True:

      # Catch unexpected exceptions to log/send mail.
      try:

        # We shouldn't be running in production with settings.DEBUG = True.
        # Just in case, though, tell django to reset its list of saved queries
        # each time through the loop.
        if settings.DEBUG:
          django.db.reset_queries()

        # Note: although we include broken but active nodes, we don't change
        # the status of broken nodes to be not broken yet if we don't detect
        # any problems. For now, most of the reason we include broken nodes
        # is so that we can tell which broken nodes are still online. This is
        # because it's not as big of a concern to have a broken node that is
        # quickly offline (e.g. broken nodes in development), but having one be
        # online for an extended period of time is a stronger signal of
        # potentially unknown bugs in the seattlegeni or seattle code.
        active_nodes = maindb.get_active_nodes_include_broken()
        log.info("Starting check of " + str(len(active_nodes)) + " active nodes.")

        checked_node_count = 0

        for node in active_nodes:
          checked_node_count += 1
          log.info("Checking node " + str(checked_node_count) + ": " + str(node))
          nodestatus.check_node(node, readonly=READONLY, lockserver_handle=lockserver_handle)

        # Print summary info.
        log.info("Nodes checked: " + str(checked_node_count))
        nodes_with_problems = nodestatus.get_node_problem_info()
        nodes_with_problems_count = len(nodes_with_problems.keys())
        log.info("Nodes without problems: " + str(checked_node_count - nodes_with_problems_count))
        log.info("Nodes with problems: " + str(nodes_with_problems_count))

        # Print information about the database changes made.
        log.info("Number of database actions taken:")
        actionstaken = nodestatus.get_actions_taken()
        for actionname in actionstaken:
          log.info("\t" + actionname + ": " + str(len(actionstaken[actionname])) +
                   " " + str(actionstaken[actionname]))

        nodestatus.reset_collected_data()

        log.info("Sleeping for " + str(SLEEP_SECONDS_BETWEEN_RUNS) + " seconds.")
        time.sleep(SLEEP_SECONDS_BETWEEN_RUNS)

      except KeyboardInterrupt:
        raise

      except:
        message = "Unexpected exception in check_active_db_nodes.py: " + traceback.format_exc()
        log.critical(message)

        # Send an email to the addresses listed in settings.ADMINS.
        if not settings.DEBUG:
          subject = "Critical SeattleGeni check_active_db_nodes.py error"
          django.core.mail.mail_admins(subject, message)

          # Sleep for ten minutes to make sure we don't flood the admins with
          # error report emails.
          time.sleep(600)

  finally:
    lockserver.destroy_lockserver_handle(lockserver_handle)
if __name__ == '__main__':
  try:
    main()
  except KeyboardInterrupt:
    log.info("Exiting on KeyboardInterrupt.")
    sys.exit(0)