Example #1
def generate_keypair():
    """
  <Purpose>
    Obtain a new (unused) public/private keypair.
  <Arguments>
    None
  <Exceptions>
    None
  <Side Effects>
    Requests a key from the keygen daemon if USE_KEYDAEMON is True. If that
    fails or if USE_KEYDAEMON is False, directly generates a key.
  <Returns>
    A tuple in the format (pubkeystr, privkeystr).
  """

    if USE_KEYDAEMON:
        try:
            return _generate_keypair_from_key_daemon()
        except:
            log.critical(
                "Unable to generate key from key daemon, falling back to " +
                "manual key generation. This may be very slow. The error " +
                " from the key daemon was: " + traceback.format_exc())

    return _generate_keypair_directly()
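A minimal usage sketch for the example above (it relies only on names the example's module already defines, namely generate_keypair and log):

pubkeystr, privkeystr = generate_keypair()
log.info("Generated a fresh keypair; the public key string is " + str(len(pubkeystr)) + " characters long.")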
Example #2
def sync_user_keys_of_vessels():
    """
  This function is started as a separate thread. It continually checks whether
  there are vessels needing their user keys sync'd and initiates the user key
  sync as needed.
  """

    log.info("[sync_user_keys_of_vessels] thread started.")

    # Run forever.
    while True:

        try:

            # Sleep a few seconds for those times where we don't have any vessels to clean up.
            time.sleep(5)

            # We shouldn't be running the backend in production with
            # settings.DEBUG = True. Just in case, though, tell django to reset its
            # list of saved queries each time through the loop.
            if settings.DEBUG:
                django.db.reset_queries()

            # Get a list of vessels that need to have user keys sync'd. This doesn't
            # include nodes known to be inactive as we would just continue failing to
            # communicate with nodes that are down.
            vessellist = maindb.get_vessels_needing_user_key_sync()
            if len(vessellist) == 0:
                continue

            log.info("[sync_user_keys_of_vessels] " + str(len(vessellist)) +
                     " vessels to have user keys sync'd: " + str(vessellist))

            parallel_results = parallel.run_parallelized(
                vessellist, _sync_user_keys_of_single_vessel)

            if len(parallel_results["exception"]) > 0:
                for vessel, exception_message in parallel_results["exception"]:
                    log_message = "Unhandled exception during parallelized vessel user key sync: " + exception_message
                    log.critical(log_message)
                # Raise the last exception so that the admin gets an email.
                raise InternalError(log_message)

        except:
            message = "[sync_user_keys_of_vessels] Something very bad happened: " + traceback.format_exc(
            )
            log.critical(message)

            # Send an email to the addresses listed in settings.ADMINS
            if not settings.DEBUG:
                subject = "Critical SeattleGeni backend error"
                django.core.mail.mail_admins(subject, message)

                # Sleep for ten minutes to make sure we don't flood the admins with error
                # report emails.
                time.sleep(600)
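The example only contains the thread body; the code that launches it is not shown. A plausible start-up sketch (the daemon flag and thread handling here are assumptions, not taken from the backend's real start-up code):

import threading

sync_thread = threading.Thread(target=sync_user_keys_of_vessels)
# Mark it as a daemon thread so the process can still exit even though the loop runs forever.
sync_thread.setDaemon(True)
sync_thread.start()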
Example #3
def monitor_held_lock_times():
    """
  Periodically checks whether there are locks that have been held too long.
  When there is a lock that has been held too long, logs it and also sends
  an email if settings.DEBUG is False.
  
  This function gets started in its own thread.
  """

    log.info("[monitor_held_lock_times] thread started.")

    # Run forever.
    while True:

        try:

            # Wait a bit between checks.
            time.sleep(SECONDS_BETWEEN_LOCK_HOLDING_TIME_CHECKS)

            # Grab the datalock and get the oldest held lock, if there are any.
            datalock.acquire()
            try:
                if len(locktimelist) == 0:
                    # No locks are held.
                    continue

                oldestlock = locktimelist[0]

            finally:
                datalock.release()

            held_timedelta = datetime.datetime.now() - oldestlock[1]

            # Check if the oldest lock has been held too long.
            if held_timedelta > MAX_EXPECTED_LOCK_HOLDING_TIMEDELTA:
                message = "Lockserver lock " + str(oldestlock[0])
                message += " has been held since " + str(oldestlock[1])
                message += " (timedelta: " + str(held_timedelta) + ")"
                # Raise an exception which will cause an email to be sent from the
                # except clause below.
                raise InternalError(message)

        # Catch all exceptions so that the monitor thread will never die.
        except:
            message = "[monitor_held_lock_times] Something very bad happened: " + traceback.format_exc(
            )
            log.critical(message)

            # Send an email to the addresses listed in settings.ADMINS
            if not settings.DEBUG:
                subject = "Critical SeattleGeni lockserver error"
                django.core.mail.mail_admins(subject, message)

                # Sleep for 30 minutes to make sure we don't flood the admins with error
                # report emails.
                time.sleep(60 * 30)
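The monitor above assumes a module-level locktimelist of (lock_name, time_acquired) pairs, oldest first, guarded by datalock. A rough sketch of the bookkeeping the rest of the lockserver would need to maintain that invariant (the helper names are made up for illustration):

import datetime
import threading

datalock = threading.Lock()
locktimelist = []   # [(lock_name, datetime_acquired), ...], oldest entry first


def _note_lock_acquired(lock_name):
    datalock.acquire()
    try:
        # Appending keeps the list ordered by acquisition time.
        locktimelist.append((lock_name, datetime.datetime.now()))
    finally:
        datalock.release()


def _note_lock_released(lock_name):
    datalock.acquire()
    try:
        for entry in locktimelist:
            if entry[0] == lock_name:
                locktimelist.remove(entry)
                break
    finally:
        datalock.release()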
Example #4
def monitor_held_lock_times():
  """
  Periodically checks whether there are locks that have been held too long.
  When there is a lock that has been held too long, logs it and also sends
  an email if settings.DEBUG is False.
  
  This function gets started in its own thread.
  """
  
  log.info("[monitor_held_lock_times] thread started.")

  # Run forever.
  while True:
    
    try:
      
      # Wait a bit between checks.
      time.sleep(SECONDS_BETWEEN_LOCK_HOLDING_TIME_CHECKS)
      
      # Grab the datalock and get the oldest held lock, if there are any.
      datalock.acquire()
      try:
        if len(locktimelist) == 0:
          # No locks are held.
          continue
        
        oldestlock = locktimelist[0]
        
      finally:
        datalock.release()
        
      held_timedelta = datetime.datetime.now() - oldestlock[1]
      
      # Check if the oldest lock has been held too long.
      if held_timedelta > MAX_EXPECTED_LOCK_HOLDING_TIMEDELTA:
        message = "Lockserver lock " + str(oldestlock[0])
        message += " has been held since " + str(oldestlock[1])
        message += " (timedelta: " + str(held_timedelta) + ")"
        # Raise an exception which will cause an email to be sent from the
        # except clause below.
        raise InternalError(message)
        
    # Catch all exceptions so that the monitor thread will never die.
    except:
      message = "[monitor_held_lock_times] Something very bad happened: " + traceback.format_exc()
      log.critical(message)
      
      # Send an email to the addresses listed in settings.ADMINS
      if not settings.DEBUG:
        subject = "Critical SeattleGeni lockserver error"
        django.core.mail.mail_admins(subject, message)
        
        # Sleep for 30 minutes to make sure we don't flood the admins with error
        # report emails.
        time.sleep(60 * 30)
Example #5
def sync_user_keys_of_vessels():
  """
  This function is started as a separate thread. It continually checks whether
  there are vessels needing their user keys sync'd and initiates the user key
  sync as needed.
  """

  log.info("[sync_user_keys_of_vessels] thread started.")

  # Run forever.
  while True:
    
    try:
      
      # Sleep a few seconds for those times where we don't have any vessels to clean up.
      time.sleep(5)
      
      # We shouldn't be running the backend in production with
      # settings.DEBUG = True. Just in case, though, tell django to reset its
      # list of saved queries each time through the loop.
      if settings.DEBUG:
        django.db.reset_queries()
      
      # Get a list of vessels that need to have user keys sync'd. This doesn't
      # include nodes known to be inactive as we would just continue failing to
      # communicate with nodes that are down.
      vessellist = maindb.get_vessels_needing_user_key_sync()
      if len(vessellist) == 0:
        continue
        
      log.info("[sync_user_keys_of_vessels] " + str(len(vessellist)) + 
               " vessels to have user keys sync'd: " + str(vessellist))
     
      parallel_results = parallel.run_parallelized(vessellist, _sync_user_keys_of_single_vessel)
     
      if len(parallel_results["exception"]) > 0:
        for vessel, exception_message in parallel_results["exception"]:
          log_message = "Unhandled exception during parallelized vessel user key sync: " + exception_message
          log.critical(log_message)
        # Raise the last exception so that the admin gets an email.
        raise InternalError(log_message)
        
    except:
      message = "[sync_user_keys_of_vessels] Something very bad happened: " + traceback.format_exc()
      log.critical(message)
      
      # Send an email to the addresses listed in settings.ADMINS
      if not settings.DEBUG:
        subject = "Critical SeattleGeni backend error"
        django.core.mail.mail_admins(subject, message)
        
        # Sleep for ten minutes to make sure we don't flood the admins with error
        # report emails.
        time.sleep(600)
Example #6
    def _dispatch(self, method, args):
        """
    We provide a _dispatch function (which SimpleXMLRPCServer looks for and
    uses) so that we can log exceptions due to our programming errors within
    the lockserver, as well as to detect incorrect usage by clients. When an
    internal lockserver error is detected, this method will signal the main
    server thread to shut down.
    """
        global lockserver_had_error

        try:
            # Get the requested function (making sure it exists).
            try:
                func = getattr(self, method)
            except AttributeError:
                raise LockserverInvalidRequestError("The requested method '" +
                                                    method +
                                                    "' doesn't exist.")

            # Call the requested function.
            return func(*args)

        except LockserverInvalidRequestError:
            log.error("The lockserver was used incorrectly: " +
                      traceback.format_exc())
            raise

        except:
            # We assume all other exceptions are bugs in the lockserver.
            # If there is a bug in the lockserver, that's really bad. We terminate the
            # lockserver in this case rather than risk incorrect locking behavior.

            # This will tell the server thread to exit.
            lockserver_had_error = True

            message = "The lockserver had an internal error and is exiting." + traceback.format_exc(
            )
            log.critical(message)

            # Send an email to the addresses listed in settings.ADMINS
            if not settings.DEBUG:
                subject = "Critical SeattleGeni lockserver error"
                django.core.mail.mail_admins(subject, message)

            # This request will likely end up seeing an xmlrpclib.ProtocolError due to the
            # shutdown, regardless of this exception.
            raise
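For context, SimpleXMLRPCServer only calls a custom _dispatch() when the object defining it has been registered with register_instance(). A minimal wiring sketch (the class name, address, and port are assumptions; the real lockserver start-up code is not part of this example):

import SimpleXMLRPCServer

class LockserverPublicFunctions(object):
    # The _dispatch() method from the example above would live here, along with
    # the public lock methods it dispatches to.
    pass

server = SimpleXMLRPCServer.SimpleXMLRPCServer(("127.0.0.1", 8010))
server.register_instance(LockserverPublicFunctions())
server.serve_forever()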
Example #7
  def _dispatch(self, method, args):
    """
    We provide a _dispatch function (which SimpleXMLRPCServer looks for and
    uses) so that we can log exceptions due to our programming errors within
    the lockserver, as well as to detect incorrect usage by clients. When an
    internal lockserver error is detected, this method will signal the main
    server thread to shut down.
    """
    global lockserver_had_error
      
    try:
      # Get the requested function (making sure it exists).
      try:
        func = getattr(self, method)
      except AttributeError:
        raise LockserverInvalidRequestError("The requested method '" + method + "' doesn't exist.")
      
      # Call the requested function.
      return func(*args)
    
    except LockserverInvalidRequestError:
      log.error("The lockserver was used incorrectly: " + traceback.format_exc())
      raise
    
    except:
      # We assume all other exceptions are bugs in the lockserver.
      # If there is a bug in the lockserver, that's really bad. We terminate the
      # lockserver in this case rather than risk incorrect locking behavior.
      
      # This will tell the server thread to exit.
      lockserver_had_error = True

      message = "The lockserver had an internal error and is exiting." + traceback.format_exc()
      log.critical(message)

      # Send an email to the addresses listed in settings.ADMINS
      if not settings.DEBUG:
        subject = "Critical SeattleGeni lockserver error"
        django.core.mail.mail_admins(subject, message)
      
      # This request will likely end up seeing an xmlrpclib.ProtocolError due to the
      # shutdown, regardless of this exception.
      raise
Example #8
def generate_keypair():
  """
  <Purpose>
    Obtain a new (unused) public/private keypair.
  <Arguments>
    None
  <Exceptions>
    None
  <Side Effects>
    Requests a key from the keygen daemon if USE_KEYDAEMON is True. If that
    fails or if USE_KEYDAEMON is False, directly generates a key.
  <Returns>
    A tuple in the format (pubkeystr, privkeystr).
  """
  
  if USE_KEYDAEMON:
    try:
      return _generate_keypair_from_key_daemon()
    except:
      log.critical("Unable to generate key from key daemon, falling back to " + 
                   "manual key generation. This may be very slow. The error " +
                   " from the key daemon was: " + traceback.format_exc())

  return _generate_keypair_directly()
Example #9
def cleanup_vessels():
  """
  This function is started as a separate thread. It continually checks whether
  there are vessels needing to be cleaned up and initiates cleanup as needed.
  """
  
  log.info("[cleanup_vessels] cleanup thread started.")

  # Start transaction management.
  django.db.transaction.enter_transaction_management()

  # Run forever.
  while True:
    
    try:
      
      # Sleep a few seconds for those times where we don't have any vessels to clean up.
      time.sleep(5)
      
      # We shouldn't be running the backend in production with
      # settings.DEBUG = True. Just in case, though, tell django to reset its
      # list of saved queries each time through the loop. Note that this is not
      # specific to the cleanup thread as other parts of the backend are using
      # the maindb, as well, so we're overloading the purpose of the cleanup
      # thread by doing this here. This is just a convenient place to do it.
      # See http://docs.djangoproject.com/en/dev/faq/models/#why-is-django-leaking-memory
      # for more info.
      if settings.DEBUG:
        django.db.reset_queries()
      
      # First, make it so that expired vessels are seen as dirty. We aren't
      # holding a lock on the nodes when we do this. It's possible that we do
      # this while someone else has a lock on the node. What would result?
      # I believe the worst result is that a user has their vessel marked as
      # dirty after they renewed in the case where they are renewing it just
      # as it expires (with some exceptionally bad timing involved). And
      # that's not really very bad: if the user is trying to renew at the
      # exact moment it expires, they're trying their luck with how fast their
      # request gets processed anyway. In short, I don't think it's important
      # enough to either obtain locks to do this or to rewrite the code to
      # avoid any need for separately marking expired vessels as dirty rather
      # than just trying to process expired vessels directly in the code below.
      date_started = datetime.datetime.now()
      expired_list = maindb.mark_expired_vessels_as_dirty()
      if len(expired_list) > 0:
        log.info("[cleanup_vessels] " + str(len(expired_list)) + 
                 " expired vessels have been marked as dirty: " + str(expired_list))
        maindb.create_action_log_event("mark_expired_vessels_as_dirty", user=None, second_arg=None,
                                       third_arg=None, was_successful=True, message=None,
                                       date_started=date_started, vessel_list=expired_list)

      # Get a list of vessels to clean up. This doesn't include nodes known to
      # be inactive as we would just continue failing to communicate with nodes
      # that are down.
      cleanupvessellist = maindb.get_vessels_needing_cleanup()
      if len(cleanupvessellist) == 0:
        continue
        
      log.info("[cleanup_vessels] " + str(len(cleanupvessellist)) + " vessels to clean up: " + str(cleanupvessellist))
      
      parallel_results = parallel.run_parallelized(cleanupvessellist, _cleanup_single_vessel)
        
      if len(parallel_results["exception"]) > 0:
        for vessel, exception_message in parallel_results["exception"]:
          log_message = "Unhandled exception during parallelized vessel cleanup: " + exception_message
          log.critical(log_message)
        # Raise the last exception so that the admin gets an email.
        raise InternalError(log_message)  
    
    except:
      message = "[cleanup_vessels] Something very bad happened: " + traceback.format_exc()
      log.critical(message)
      
      # Send an email to the addresses listed in settings.ADMINS
      if not settings.DEBUG:
        subject = "Critical SeattleGeni backend error"
        django.core.mail.mail_admins(subject, message)
        
        # Sleep for ten minutes to make sure we don't flood the admins with error
        # report emails.
        time.sleep(600)
    finally:
      # Manually commit the transaction to prevent caching.
      django.db.transaction.commit()
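The loop above only reads parallel_results["exception"], a list of (item, traceback_string) pairs. A simplified, serial stand-in for run_parallelized that produces the same shape (a sketch only; the real seattle parallel module runs the calls concurrently, and the "returned" key is an assumption about the rest of the result):

import traceback

def run_parallelized_sketch(item_list, func):
  results = {"exception": [], "returned": []}
  for item in item_list:
    try:
      results["returned"].append((item, func(item)))
    except Exception:
      # This mirrors what cleanup_vessels() reads: the item plus its formatted traceback.
      results["exception"].append((item, traceback.format_exc()))
  return results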
Example #10
def main():
  """
  This will run an infinite loop of checks over all of the active nodes in the
  database.
  """
  
  lockserver_handle = lockserver.create_lockserver_handle()

  # Always try to release the lockserver handle, though it's probably not
  # very useful in this case.
  try:
    
    while True:
      
      # Catch unexpected exceptions to log/send mail.
      try:
      
        # We shouldn't be running in production with settings.DEBUG = True. 
        # Just in case, though, tell django to reset its list of saved queries
        # each time through the loop.
        if settings.DEBUG:
          django.db.reset_queries()
        
        # Note: although we include broken but active nodes, we don't change
        # the status of broken nodes to be not broken yet if we don't detect
        # any problems. For now, most of the reason we include broken nodes
        # is so that we can tell which broken nodes are still online. This is
        # because it's not as big of a concern to have a broken node that is
        # quickly offline (e.g. broken nodes in development), but having one be
        # online for an extended period of time is a stronger signal of
        # potentially unknown bugs in the seattlegeni or seattle code.
        active_nodes = maindb.get_active_nodes_include_broken()
        log.info("Starting check of " + str(len(active_nodes)) + " active nodes.")
      
        checked_node_count = 0
        
        for node in active_nodes:
          
          checked_node_count += 1
          log.info("Checking node " + str(checked_node_count) + ": " + str(node))
          
          nodestatus.check_node(node, readonly=READONLY, lockserver_handle=lockserver_handle)
          
        # Print summary info.
        log.info("Nodes checked: " + str(checked_node_count))
        nodes_with_problems = nodestatus.get_node_problem_info()
        nodes_with_problems_count = len(nodes_with_problems.keys())
        log.info("Nodes without problems: " + str(checked_node_count - nodes_with_problems_count))
        log.info("Nodes with problems: " + str(nodes_with_problems_count))
        
        # Print information about the database changes made.
        log.info("Number of database actions taken:")
        actionstaken = nodestatus.get_actions_taken()
        for actionname in actionstaken:
          log.info("\t" + actionname + ": " + str(len(actionstaken[actionname])) + 
                   " " + str(actionstaken[actionname]))
    
        nodestatus.reset_collected_data()
        
        log.info("Sleeping for " + str(SLEEP_SECONDS_BETWEEN_RUNS) + " seconds.")
        time.sleep(SLEEP_SECONDS_BETWEEN_RUNS)
  
      except KeyboardInterrupt:
        raise
  
      except:
        message = "Unexpected exception in check_active_db_nodes.py: " + traceback.format_exc()
        log.critical(message)
    
        # Send an email to the addresses listed in settings.ADMINS
        if not settings.DEBUG:
          subject = "Critical SeattleGeni check_active_db_nodes.py error"
          django.core.mail.mail_admins(subject, message)
          
          # Sleep for ten minutes to make sure we don't flood the admins with error
          # report emails.
          time.sleep(600)

  finally:
    lockserver.destroy_lockserver_handle(lockserver_handle)
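Since main() re-raises KeyboardInterrupt rather than swallowing it, a top-level entry point (assumed here, not shown in the example) can decide how to exit cleanly:

if __name__ == '__main__':
  try:
    main()
  except KeyboardInterrupt:
    log.info("Exiting on KeyboardInterrupt.")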
Example #11
  def process_exception(self, request, exception):
    log.critical("An unhandled exception resulted from a request: " + traceback.format_exc())

    # Returning None indicates that default exception handling should be done.
    return None
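process_exception() is one of Django's middleware hooks, so the method above needs to live on a middleware class that the project settings list. A registration sketch assuming the old-style MIDDLEWARE_CLASSES convention of that Django era (the class and module names are made up for illustration):

# middleware.py (hypothetical module)
class LogUnhandledExceptionMiddleware(object):
  # The process_exception() method shown in the example above goes here.
  pass

# settings.py
MIDDLEWARE_CLASSES = (
  'django.middleware.common.CommonMiddleware',
  # ...
  'myproject.middleware.LogUnhandledExceptionMiddleware',  # hypothetical dotted path
)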
Example #12
    
    except NodemanagerCommunicationError, e:
      raise xmlrpclib.Fault(100, "Node communication failure: " + str(e))
    
    except (DoesNotExistError, InvalidRequestError, AssertionError):
      log.error("The backend was used incorrectly: " + traceback.format_exc())
      raise
    
    except:
      # We assume all other exceptions are bugs in the backend. Unlike the
      # lockserver where it might result in broader data corruption, here in
      # the backend we allow the backend to continue serving other requests.
      # That is, we don't go through steps to try to shutdown the backend.
      
      message = "The backend had an internal error: " + traceback.format_exc()
      log.critical(message)
      
      # Send an email to the addresses listed in settings.ADMINS
      if not settings.DEBUG:
        subject = "Critical SeattleGeni backend error"
        django.core.mail.mail_admins(subject, message)
      
      raise
      




  # Using @staticmethod makes it so that 'self' doesn't get passed in as the first arg.
  @staticmethod
  @log_function_call
Example #13
            raise xmlrpclib.Fault(100, "Node communication failure: " + str(e))

        except (DoesNotExistError, InvalidRequestError, AssertionError):
            log.error("The backend was used incorrectly: " +
                      traceback.format_exc())
            raise

        except:
            # We assume all other exceptions are bugs in the backend. Unlike the
            # lockserver where it might result in broader data corruption, here in
            # the backend we allow the backend to continue serving other requests.
            # That is, we don't go through steps to try to shutdown the backend.

            message = "The backend had an internal error: " + traceback.format_exc(
            )
            log.critical(message)

            # Send an email to the addresses listed in settings.ADMINS
            if not settings.DEBUG:
                subject = "Critical SeattleGeni backend error"
                django.core.mail.mail_admins(subject, message)

            raise

    # Using @staticmethod makes it so that 'self' doesn't get passed in as the first arg.
    @staticmethod
    @log_function_call
    def GenerateKey(*args):
        """
    This is a public function of the XMLRPC server. See the module comments at
    the top of the file for a description of how it is used.
Example #14
    def _dispatch(self, method, args):
        """
    We provide a _dispatch function (which SimpleXMLRPCServer looks for and
    uses) so that we can log exceptions due to our programming errors within
    seattlegeni, as well as to detect incorrect usage by clients.
    """

        try:
            # Get the requested function (making sure it exists).
            try:
                func = getattr(self, method)
            except AttributeError:
                raise InvalidRequestError("The requested method '" + method + "' doesn't exist.")

            # Call the requested function.
            return func(*args)

        except InvalidRequestError:
            log.error("The xmlrpc server was used incorrectly: " + traceback.format_exc())
            raise

        except xmlrpclib.Fault:
            # An xmlrpc Fault was intentionally raised by the code in this module.
            raise

        except Exception, e:
            # We assume all other exceptions are bugs in our code.

            # We use the log message as the basis for the email message, as well.
            logmessage = "Internal error while handling an xmlrpc request: " + traceback.format_exc()
            log.critical(logmessage)

            # Normally django will send an email to the ADMINS defined in settings.py
            # when an exception occurs. However, our xmlrpc dispatcher will turn this
            # into a Fault that is returned to the client. So, django won't see it as
            # an uncaught exception. Therefore, we have to send it ourselves.
            if not settings.DEBUG:
                subject = "Error handling xmlrpc request '" + method + "': " + str(type(e)) + " " + str(e)

                emailmessage = logmessage + "\n\n"
                emailmessage += "XMLRPC method called: " + method + "\n"

                # If the first argument looks like auth info, don't include the
                # api_key in the email we send. Otherwise, include all the args.
                # We wrap this in a try block just in case we screw this up; we want to
                # be sure we still get an email.
                try:
                    if len(args) > 0 and isinstance(args[0], dict) and "username" in args[0]:
                        emailmessage += "Username: "******"username"]) + "\n"
                        if len(args) > 1:
                            emailmessage += "Non-auth arguments: " + str(args[1:]) + "\n"
                        else:
                            emailmessage += "There were no non-auth arguments." + "\n"
                    else:
                        emailmessage += "Arguments: " + str(args) + "\n"
                except:
                    pass

                # Send an email to the addresses listed in settings.ADMINS
                django.core.mail.mail_admins(subject, emailmessage)

            # It's not unlikely that the user ends up seeing this message, so we
            # are careful about what the content of the message is. We don't
            # include the exception trace.
            raise xmlrpclib.Fault(FAULTCODE_INTERNALERROR, "Internal error while handling the xmlrpc request.")
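From the client side, the internal error above arrives as an xmlrpclib.Fault carrying FAULTCODE_INTERNALERROR. A minimal client sketch (the URL, method name, and fault-code value are assumptions made for illustration):

import xmlrpclib

FAULTCODE_INTERNALERROR = 105   # assumed value; the real constant is defined on the server side

proxy = xmlrpclib.ServerProxy("https://seattlegeni.example.org/xmlrpc/")
try:
    info = proxy.get_account_info({"username": "alice", "api_key": "0123456789"})
except xmlrpclib.Fault, fault:
    if fault.faultCode == FAULTCODE_INTERNALERROR:
        print "The server had an internal error; the admins were emailed about it."
    else:
        raise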
Example #15
def cleanup_vessels():
    """
  This function is started as a separate thread. It continually checks whether
  there are vessels needing to be cleaned up and initiates cleanup as needed.
  """

    log.info("[cleanup_vessels] cleanup thread started.")

    # Start transaction management.
    django.db.transaction.enter_transaction_management()

    # Run forever.
    while True:

        try:

            # Sleep a few seconds for those times where we don't have any vessels to clean up.
            time.sleep(5)

            # We shouldn't be running the backend in production with
            # settings.DEBUG = True. Just in case, though, tell django to reset its
            # list of saved queries each time through the loop. Note that this is not
            # specific to the cleanup thread as other parts of the backend are using
            # the maindb, as well, so we're overloading the purpose of the cleanup
            # thread by doing this here. This is just a convenient place to do it.
            # See http://docs.djangoproject.com/en/dev/faq/models/#why-is-django-leaking-memory
            # for more info.
            if settings.DEBUG:
                django.db.reset_queries()

            # First, make it so that expired vessels are seen as dirty. We aren't
            # holding a lock on the nodes when we do this. It's possible that we do
            # this while someone else has a lock on the node. What would result?
            # I believe the worst result is that a user has their vessel marked as
            # dirty after they renewed in the case where they are renewing it just
            # as it expires (with some exceptionally bad timing involved). And
            # that's not really very bad: if the user is trying to renew at the
            # exact moment it expires, they're trying their luck with how fast their
            # request gets processed anyway. In short, I don't think it's important
            # enough to either obtain locks to do this or to rewrite the code to
            # avoid any need for separately marking expired vessels as dirty rather
            # than just trying to process expired vessels directly in the code below.
            date_started = datetime.datetime.now()
            expired_list = maindb.mark_expired_vessels_as_dirty()
            if len(expired_list) > 0:
                log.info("[cleanup_vessels] " + str(len(expired_list)) +
                         " expired vessels have been marked as dirty: " +
                         str(expired_list))
                maindb.create_action_log_event("mark_expired_vessels_as_dirty",
                                               user=None,
                                               second_arg=None,
                                               third_arg=None,
                                               was_successful=True,
                                               message=None,
                                               date_started=date_started,
                                               vessel_list=expired_list)

            # Get a list of vessels to clean up. This doesn't include nodes known to
            # be inactive as we would just continue failing to communicate with nodes
            # that are down.
            cleanupvessellist = maindb.get_vessels_needing_cleanup()
            if len(cleanupvessellist) == 0:
                continue

            log.info("[cleanup_vessels] " + str(len(cleanupvessellist)) +
                     " vessels to clean up: " + str(cleanupvessellist))

            parallel_results = parallel.run_parallelized(
                cleanupvessellist, _cleanup_single_vessel)

            if len(parallel_results["exception"]) > 0:
                for vessel, exception_message in parallel_results["exception"]:
                    log_message = "Unhandled exception during parallelized vessel cleanup: " + exception_message
                    log.critical(log_message)
                # Raise the last exception so that the admin gets an email.
                raise InternalError(log_message)

        except:
            message = "[cleanup_vessels] Something very bad happened: " + traceback.format_exc(
            )
            log.critical(message)

            # Send an email to the addresses listed in settings.ADMINS
            if not settings.DEBUG:
                subject = "Critical SeattleGeni backend error"
                django.core.mail.mail_admins(subject, message)

                # Sleep for ten minutes to make sure we don't flood the admins with error
                # report emails.
                time.sleep(600)
        finally:
            # Manually commit the transaction to prevent caching.
            django.db.transaction.commit()
Example #16
def main():
    """
  This will run an infinite loop of checks over all of the active nodes in the
  database.
  """

    lockserver_handle = lockserver.create_lockserver_handle()

    # Always try to release the lockserver handle, though it's probably not
    # very useful in this case.
    try:

        while True:

            # Catch unexpected exceptions to log/send mail.
            try:

                # We shouldn't be running in production with settings.DEBUG = True.
                # Just in case, though, tell django to reset its list of saved queries
                # each time through the loop.
                if settings.DEBUG:
                    django.db.reset_queries()

                # Note: although we include broken but active nodes, we don't change
                # the status of broken nodes to be not broken yet if we don't detect
                # any problems. For now, most of the reason we include broken nodes
                # is so that we can tell which broken nodes are still online. This is
                # because it's not as big of a concern to have a broken node that is
                # quickly offline (e.g. broken nodes in development), but having one be
                # online for an extended period of time is a stronger signal of
                # potentially unknown bugs in the seattlegeni or seattle code.
                active_nodes = maindb.get_active_nodes_include_broken()
                log.info("Starting check of " + str(len(active_nodes)) +
                         " active nodes.")

                checked_node_count = 0

                for node in active_nodes:

                    checked_node_count += 1
                    log.info("Checking node " + str(checked_node_count) +
                             ": " + str(node))

                    nodestatus.check_node(node,
                                          readonly=READONLY,
                                          lockserver_handle=lockserver_handle)

                # Print summary info.
                log.info("Nodes checked: " + str(checked_node_count))
                nodes_with_problems = nodestatus.get_node_problem_info()
                nodes_with_problems_count = len(nodes_with_problems.keys())
                log.info("Nodes without problems: " +
                         str(checked_node_count - nodes_with_problems_count))
                log.info("Nodes with problems: " +
                         str(nodes_with_problems_count))

                # Print information about the database changes made.
                log.info("Number of database actions taken:")
                actionstaken = nodestatus.get_actions_taken()
                for actionname in actionstaken:
                    log.info("\t" + actionname + ": " +
                             str(len(actionstaken[actionname])) + " " +
                             str(actionstaken[actionname]))

                nodestatus.reset_collected_data()

                log.info("Sleeping for " + str(SLEEP_SECONDS_BETWEEN_RUNS) +
                         " seconds.")
                time.sleep(SLEEP_SECONDS_BETWEEN_RUNS)

            except KeyboardInterrupt:
                raise

            except:
                message = "Unexpected exception in check_active_db_nodes.py: " + traceback.format_exc(
                )
                log.critical(message)

                # Send an email to the addresses listed in settings.ADMINS
                if not settings.DEBUG:
                    subject = "Critical SeattleGeni check_active_db_nodes.py error"
                    django.core.mail.mail_admins(subject, message)

                    # Sleep for ten minutes to make sure we don't flood the admins with error
                    # report emails.
                    time.sleep(600)

    finally:
        lockserver.destroy_lockserver_handle(lockserver_handle)
Example #17
  def _dispatch(self, method, args):
    """
    We provide a _dispatch function (which SimpleXMLRPCServer looks for and
    uses) so that we can log exceptions due to our programming errors within
    seattlegeni, as well as to detect incorrect usage by clients.
    """
      
    try:
      # Get the requested function (making sure it exists).
      try:
        func = getattr(self, method)
      except AttributeError:
        raise InvalidRequestError("The requested method '" + method + "' doesn't exist.")
      
      # Call the requested function.
      return func(*args)
    
    except InvalidRequestError:
      log.error("The xmlrpc server was used incorrectly: " + traceback.format_exc())
      raise
    
    except xmlrpclib.Fault:
      # An xmlrpc Fault was intentionally raised by the code in this module.
      raise
    
    except Exception, e:
      # We assume all other exceptions are bugs in our code.

      # We use the log message as the basis for the email message, as well.
      logmessage = "Internal error while handling an xmlrpc request: " + traceback.format_exc()
      log.critical(logmessage)

      # Normally django will send an email to the ADMINS defined in settings.py
      # when an exception occurs. However, our xmlrpc dispatcher will turn this
      # into a Fault that is returned to the client. So, django won't see it as
      # an uncaught exception. Therefore, we have to send it ourselves.
      if not settings.DEBUG:
        subject = "Error handling xmlrpc request '" + method + "': " + str(type(e)) + " " + str(e)
        
        emailmessage = logmessage + "\n\n"
        emailmessage += "XMLRPC method called: " + method + "\n"
        
        # If the first argument looks like auth info, don't include the
        # api_key in the email we send. Otherwise, include all the args.
        # We wrap this in a try block just in case we screw this up; we want to
        # be sure we still get an email.
        try:
          if len(args) > 0 and isinstance(args[0], dict) and "username" in args[0]:
            emailmessage += "Username: "******"username"]) + "\n"
            if len(args) > 1:
              emailmessage += "Non-auth arguments: " + str(args[1:]) + "\n"
            else:
              emailmessage += "There were no non-auth arguments." + "\n"
          else:
            emailmessage += "Arguments: " + str(args) + "\n"
        except:
          pass
          
        # Send an email to the addresses listed in settings.ADMINS
        django.core.mail.mail_admins(subject, emailmessage)
      
      # It's not unlikely that the user ends up seeing this message, so we
      # are careful about what the content of the message is. We don't
      # include the exception trace.
      raise xmlrpclib.Fault(FAULTCODE_INTERNALERROR, "Internal error while handling the xmlrpc request.")