Example #1
0
def flag_set(section, setting):
    """True if the given boolean setting is enabled in openquake.cfg

    :param string section: name of the configuration file section
    :param string setting: name of the configuration file setting

    :returns: True if the setting is enabled in openquake.cfg, False otherwise
    """
    setting = get(section, setting)
    if setting is None:
        return False
    return general.str2bool(setting)
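The helper general.str2bool is not shown in these snippets; judging from the no_distribute docstring later in this section (truthy values "true", "yes", "t", "1", case-insensitive), a hypothetical stand-in consistent with that contract would be:

def str2bool(value):
    # Hypothetical sketch of general.str2bool, inferred from the
    # no_distribute() docstring below; the real helper may differ.
    return value.strip().lower() in ('true', 'yes', 't', '1')

# Typical call site, mirroring the TERMINATE constant in a later snippet:
#   flag_set('celery', 'terminate_workers_on_revoke')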
Example #2
0
def check_nodes(self):
    """
    Check that the expected celery nodes are all up. The loop
    continues for as long as the main thread keeps running.
    """
    while self.job_is_running(sleep=self.interval):
        live_nodes = self.ping(timeout=self.interval)
        if live_nodes < self.live_nodes:
            dead_nodes = list(self.live_nodes - live_nodes)
            logs.LOG.critical(
                'Cluster nodes not accessible: %s', dead_nodes)
            terminate = general.str2bool(
                config.get('celery', 'terminate_job_when_celery_is_down'))
            if terminate:
                os.kill(os.getpid(), signal.SIGABRT)  # commit suicide
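The dead-node check relies on plain set arithmetic, assuming ping() returns a set of node names like self.live_nodes: live_nodes < self.live_nodes is a proper-subset test, and the difference yields the unreachable nodes. A standalone illustration with made-up node names:

expected = {'worker1', 'worker2', 'worker3'}   # plays the role of self.live_nodes
reachable = {'worker1', 'worker3'}             # plays the role of ping()'s result

if reachable < expected:                       # proper subset: something is down
    dead = sorted(expected - reachable)        # ['worker2']
    print('Cluster nodes not accessible: %s' % dead)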
Example #3
0
def no_distribute():
    """
    Check the `OQ_NO_DISTRIBUTE` environment var to determine if calculations
    should be distributed or not.

    :returns:
        `True` if the envvar value is "true", "yes", "t", or "1", regardless of
        case. Otherwise, return `False`.

        If the variable is undefined, it defaults to `False`.
    """
    nd = os.environ.get(NO_DISTRIBUTE_VAR)

    if nd is None:
        return False
    else:
        return general_utils.str2bool(nd)
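A quick way to exercise no_distribute is to set the variable in-process; this sketch assumes NO_DISTRIBUTE_VAR is bound to the string "OQ_NO_DISTRIBUTE", as the docstring suggests:

import os

os.environ['OQ_NO_DISTRIBUTE'] = 'Yes'   # any of true/yes/t/1, case-insensitive
assert no_distribute()

os.environ.pop('OQ_NO_DISTRIBUTE')       # undefined -> defaults to False
assert not no_distribute()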
Example #4
0
from openquake import hazardlib
from openquake import risklib
from openquake import nrmllib


INPUT_TYPES = dict(models.INPUT_TYPE_CHOICES)

UNABLE_TO_DEL_HC_FMT = 'Unable to delete hazard calculation: %s'
UNABLE_TO_DEL_RC_FMT = 'Unable to delete risk calculation: %s'

LOG_FORMAT = ('[%(asctime)s %(calc_domain)s #%(calc_id)s %(hostname)s '
              '%(levelname)s %(processName)s/%(process)s %(name)s] '
              '%(message)s')

TERMINATE = general.str2bool(
    config.get('celery', 'terminate_workers_on_revoke'))


def cleanup_after_job(job, terminate):
    """
    Release the resources used by an openquake job.
    In particular revoke the running tasks (if any).

    :param int job_id: the job id
    :param bool terminate: the celery revoke command terminate flag
    """
    # Using the celery API, terminate and revoke and terminate any running
    # tasks associated with the current job.
    task_ids = Performance.objects.filter(
        oq_job=job, operation='storing task id', task_id__isnull=False)\
        .values_list('task_id', flat=True)
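The snippet is cut off after collecting the task ids. A plausible continuation, sketched here under the assumption that the pre-4.x celery.task.control.revoke API is available, would revoke each task and optionally terminate it:

from celery.task.control import revoke

for tid in task_ids:
    # terminate=True also kills the worker process currently running the task
    revoke(tid, terminate=terminate)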
Example #5
0
class SupervisorLogMessageConsumer(logs.AMQPLogSource):
    """
    Supervise an OpenQuake job by:

       - handling its "critical" and "error" messages
       - periodically checking that the job process is still running
    """
    # Failure counter check delay, translates to 60 seconds with the current
    # settings.
    FCC_DELAY = 60
    terminate = general.str2bool(
        config.get('celery', 'terminate_workers_on_revoke'))

    def __init__(self, job_id, job_pid, timeout=1):
        self.job_id = job_id
        job = OqJob.objects.get(id=job_id)
        self.calc_id = job.calculation.id
        if job.hazard_calculation is not None:
            self.calc_domain = 'hazard'
        else:
            self.calc_domain = 'risk'

        self.selflogger = logging.getLogger('oq.%s.%s.supervisor' %
                                            (self.calc_domain, self.calc_id))
        self.selflogger.debug('Entering supervisor for %s calc %s' %
                              (self.calc_domain, self.calc_id))
        logger_name = 'oq.%s.%s' % (self.calc_domain, self.calc_id)
        key = '%s.#' % logger_name
        super(SupervisorLogMessageConsumer, self).__init__(timeout=timeout,
                                                           routing_key=key)
        self.job_pid = job_pid
        self.joblogger = logging.getLogger(logger_name)
        self.jobhandler = logging.Handler(logging.ERROR)
        self.jobhandler.emit = self.log_callback
        self.joblogger.addHandler(self.jobhandler)
        # Failure counter check delay value
        self.fcc_delay_value = 0

    def run(self):
        """
        Wrap the superclass method just to add cleanup.
        """
        started = datetime.utcnow()
        super(SupervisorLogMessageConsumer, self).run()
        stopped = datetime.utcnow()
        self.selflogger.info(
            '%s calc %s finished in %s' %
            (self.calc_domain, self.calc_id, stopped - started))
        self.joblogger.removeHandler(self.jobhandler)
        self.selflogger.debug('Exiting supervisor for %s calc %s' %
                              (self.calc_domain, self.calc_id))

    def log_callback(self, record):
        """
        Handle "error" and "critical" messages coming from the supervised job.
        """
        if record.name == self.selflogger.name:
            # ignore error log messages sent by selflogger.
            # this way we don't try to kill the job if its
            # process has crashed (or has been stopped).
            # we emit selflogger's error messages from
            # timeout_callback().
            return

        terminate_job(self.job_pid)

        update_job_status(self.job_id)

        record_job_stop_time(self.job_id)

        cleanup_after_job(self.job_id, self.terminate)

        self.stop()

    def timeout_callback(self):
        """
        On timeout expiration check if the job process is still running
        and whether it experienced any failures.

        Terminate the job process in the latter case.
        """
        def failure_counters_need_check():
            """Return `True` if failure counters should be checked."""
            self.fcc_delay_value += 1
            result = self.fcc_delay_value >= self.FCC_DELAY
            if result:
                self.fcc_delay_value = 0
            return result

        process_stopped = job_failed = False
        message = None

        if not supervising.is_pid_running(self.job_pid):
            message = ('job process %s crashed or terminated' % self.job_pid)
            process_stopped = True
        elif failure_counters_need_check():
            # Job process is still running.
            failures = stats.failure_counters(self.job_id)
            failed_nodes = None
            if failures:
                message = "job terminated with failures: %s" % failures
            else:
                # Don't check for failed nodes if distribution is disabled.
                # In this case, we don't expect any nodes to be present, and
                # thus, there are none that can fail.
                if not openquake.engine.no_distribute():
                    failed_nodes = abort_due_to_failed_nodes(self.job_id)
                    if failed_nodes:
                        message = ("job terminated due to %s failed nodes" %
                                   failed_nodes)
            if failures or failed_nodes:
                terminate_job(self.job_pid)
                job_failed = True

        if job_failed or process_stopped:
            job_status = get_job_status(self.job_id)
            if process_stopped and job_status == 'complete':
                message = 'job process %s succeeded' % self.job_pid
                self.selflogger.debug(message)
            elif job_status != 'complete':
                # The job crashed without having a chance to update the
                # status in the database, or it has been running even though
                # there were failures. We update the job status here.
                self.selflogger.error(message)
                update_job_status(self.job_id)

            record_job_stop_time(self.job_id)
            cleanup_after_job(self.job_id, self.terminate)
            raise StopIteration()
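The failure-counter throttling in timeout_callback is simple counting: the callback fires once per timeout interval (1 second by default), and the counters are only inspected every FCC_DELAY firings, i.e. roughly once a minute. A minimal standalone sketch of the same pattern, with names local to this sketch:

class FailureCounterThrottle(object):
    """Return True on every FCC_DELAY-th call, False otherwise."""
    FCC_DELAY = 60

    def __init__(self):
        self.fcc_delay_value = 0

    def need_check(self):
        self.fcc_delay_value += 1
        if self.fcc_delay_value >= self.FCC_DELAY:
            self.fcc_delay_value = 0
            return True
        return False

# With a 1-second timeout this flags the 60th, 120th, ... call, so the
# failure counters are checked about once a minute.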