예제 #1
0
def main(config_file, std_err=False, verbose=True, clean_histdata=False):
    """
    Main function of the script

    Args:
        config_file: file path of the config file to load
        std_err: whether print logging output to stderr
        verbose: whether to provide verbose logging messages
        clean_histdata: all historical data should be cleared
    """

    try:
        # Configure logging:
        fmt = logging.Formatter('%(filename)s[%(process)d] %(levelname)s: ' +
                                '%(message)s')
        logger = logging.getLogger()
        if verbose:
            logger.setLevel(logging.DEBUG)
        else:
            logger.setLevel(logging.INFO)
        if std_err:
            handler = logging.StreamHandler()
        else:
            handler = lh.SysLogHandler(address='/dev/log',
                                       facility=lh.SysLogHandler.LOG_USER)
        handler.setFormatter(fmt)
        logger.addHandler(handler)

        logger.debug("{0} is starting, ".format(os.path.basename(__file__)) +
                     "command line arguments: " +
                     "config_file={0}, ".format(config_file) +
                     "std_err={0}, ".format(std_err) +
                     "verbose={0}, ".format(verbose) +
                     "clean_histdata={0}".format(clean_histdata)
                     )

        # FIXME - Remember to correctly configure syslog, otherwise rsyslog will
        # discard messages
        ScriptConfiguration.load_config(config_file)

        logger.debug("Loaded configuration: " +
                     str(ScriptConfiguration.get_config())
                     )

        # Initialize reporting to monitoring system:
        ScriptStatus.init(nrpe_enable=True)

        # Make sure that we are the only ones running on the server:
        ScriptLock.init(ScriptConfiguration.get_val('lockfile'))
        ScriptLock.aqquire()

        # Some basic sanity checking:
        verify_conf()

        # We are all set, lets do some real work:
        HistoryFile.init(location=ScriptConfiguration.get_val('history_file'),
                         max_averaging_window=ScriptConfiguration.get_val(
                             'max_averaging_window'),
                         min_averaging_window=ScriptConfiguration.get_val(
                             'min_averaging_window'))

        if clean_histdata:
            HistoryFile.clear_history()
            HistoryFile.save()
            ScriptStatus.notify_immediate('unknown',
                                          'History data has been cleared.')

        timeframe = ScriptConfiguration.get_val('timeframe')

        # FIXME: not sure how to refactor this, copypaste does not seem the best
        # solution :(
        def do_status_processing(prefix, current_growth, planned_growth,
                                 mountpoint=None, data_type=None):
            warn_tresh = 1 + (ScriptConfiguration.get_val(
                prefix + '_mon_warn_reduction')/100)
            crit_tresh = 1 + (ScriptConfiguration.get_val(
                prefix + '_mon_crit_reduction')/100)

            if prefix == 'disk' and data_type == 'inode':
                units = 'inodes/day'
            else:
                units = 'MB/day'

            if prefix == 'disk':
                rname = data_type + \
                    ' usage growth for mount {0}'.format(mountpoint)
            else:
                rname = '{0} usage growth'.format(prefix)

            rname = rname.capitalize()

            if current_growth > planned_growth * warn_tresh:
                msg = '{0} exceeds planned growth '.format(rname) + \
                      '- current: {0} {1}'.format(current_growth, units) + \
                      ', planned: {0} {1}.'.format(planned_growth, units)
                if current_growth > planned_growth * crit_tresh:
                    ScriptStatus.update('crit', msg)
                else:
                    ScriptStatus.update('warn', msg)
            else:
                ScriptStatus.update('ok',
                                    '{0} is OK ({1} {2}).'.format(
                                        rname, current_growth, units))

        if ScriptConfiguration.get_val('memory_mon_enabled'):
            cur_usage, max_usage = fetch_memory_usage()
            HistoryFile.add_datapoint('memory', cur_usage)
            tmp = HistoryFile.verify_dataspan('memory')
            if tmp < 0:
                ScriptStatus.update('unknown', 'There is not enough data ' +
                                    'to calculate current memory ' +
                                    'usage growth: {0} '.format(abs(tmp)) +
                                    'days more is needed.')
            else:
                datapoints = HistoryFile.get_datapoints('memory')

                planned_growth = find_planned_grow_ratio(cur_usage, max_usage,
                                                         timeframe)
                current_growth = find_current_grow_ratio(datapoints)

                logging.debug('memory -> ' +
                              'current_growth: {0}, '.format(current_growth) +
                              'planned_growth: {0}'.format(planned_growth))

                do_status_processing('memory', current_growth, planned_growth)

        if ScriptConfiguration.get_val('disk_mon_enabled'):
            mountpoints = ScriptConfiguration.get_val('disk_mountpoints')
            for dtype in ['space', 'inode']:
                for mountpoint in mountpoints:
                    if dtype == 'inode':
                        cur_usage, max_usage = fetch_inode_usage(mountpoint)
                    else:
                        cur_usage, max_usage = fetch_disk_usage(mountpoint)
                    HistoryFile.add_datapoint('disk', cur_usage,
                                              data_type=dtype,
                                              path=mountpoint)
                    tmp = HistoryFile.verify_dataspan('disk',
                                                      data_type=dtype,
                                                      path=mountpoint)
                    if tmp < 0:
                        ScriptStatus.update('unknown',
                                            'There is not enough data to ' +
                                            'calculate current disk ' + dtype +
                                            ' usage growth for mountpoint ' +
                                            '{0}: {1} '.format(
                                                mountpoint, abs(tmp)) +
                                            'days more is needed.')
                    else:
                        datapoints = HistoryFile.get_datapoints('disk',
                                                                data_type=dtype,
                                                                path=mountpoint)
                        planned_growth = find_planned_grow_ratio(cur_usage,
                                                                 max_usage,
                                                                 timeframe)
                        current_growth = find_current_grow_ratio(datapoints)

                        logging.debug('disk, ' +
                                      'mountpoint {0}, '.format(mountpoint) +
                                      'data_type {0}: '.format(dtype) +
                                      'current_growth: {0}'.format(current_growth) +
                                      'planned_growth: {0}'.format(planned_growth))
                        do_status_processing('disk', current_growth, planned_growth,
                                             mountpoint=mountpoint, data_type=dtype)

        HistoryFile.save()
        ScriptStatus.notify_agregated()
        ScriptLock.release()

    except RecoverableException as e:
        msg = str(e)
        logging.critical(msg)
        ScriptStatus.notify_immediate('unknown', msg)
        sys.exit(1)
    except AssertionError as e:
        # Unittests require it:
        raise
    except Exception as e:
        msg = "Exception occured: {0}".format(e.__class__.__name__)
        logging.exception(msg)
        print(msg)  # We can use notify immediate here :(
        sys.exit(3)