예제 #1
0
    def test_cache(self):
        """Test the caching"""
        (handle, filename) = tempfile.mkstemp()
        os.unlink(filename)

        n = SimpleNagios(_cache=filename, _cache_user=self.nagios_user)
        message = "mywarning"
        n.warning(message)
        os.close(handle)

        self.buffo.seek(0)
        self.buffo.truncate(0)

        raised_exception = None
        try:
            reporter_test = NagiosReporter('test_cache', filename, -1,
                                           self.nagios_user)
            reporter_test.report_and_exit()
        except SystemExit as err:
            raised_exception = err
        bo = self.buffo.getvalue().rstrip()

        self.assertEqual(bo, "WARNING %s" % message)
        self.assertEqual(raised_exception.code, NAGIOS_EXIT_WARNING[0])

        statres = os.stat(filename)

        self.assertFalse(statres.st_mode & stat.S_IROTH)
예제 #2
0
    def test_cache(self):
        """Test the caching mechanism in the reporter."""
        length = random.randint(1, 30)
        exit_code = random.randint(0, 3)
        threshold = random.randint(0, 10)

        message = ''.join(random.choice(string.printable) for x in range(length))
        message = message.rstrip()

        (handle, filename) = tempfile.mkstemp()
        os.unlink(filename)
        os.close(handle)
        reporter = NagiosReporter('test_cache', filename, threshold, self.nagios_user)

        nagios_exit = [NAGIOS_EXIT_OK, NAGIOS_EXIT_WARNING, NAGIOS_EXIT_CRITICAL, NAGIOS_EXIT_UNKNOWN][exit_code]

        reporter.cache(nagios_exit, message)

        (handle, output_filename) = tempfile.mkstemp()
        os.close(handle)

        try:
            old_stdout = sys.stdout
            buffer = StringIO.StringIO()
            sys.stdout = buffer
            reporter_test = NagiosReporter('test_cache', filename, threshold, self.nagios_user)
            reporter_test.report_and_exit()
        except SystemExit, err:
            line = buffer.getvalue().rstrip()
            sys.stdout = old_stdout
            buffer.close()
            self.assertTrue(err.code == nagios_exit[0])
            self.assertTrue(line == "%s %s" % (nagios_exit[1], message))
예제 #3
0
    def test_cache(self, exit_code, message, threshold):
        """Test the caching mechanism in the reporter."""
        message = message.rstrip()
        if message == '':
            return

        (handle, filename) = tempfile.mkstemp()
        os.unlink(filename)
        reporter = NagiosReporter('test_cache', filename, threshold,
                                  self.nagios_user)

        nagios_exit = [
            NAGIOS_EXIT_OK, NAGIOS_EXIT_WARNING, NAGIOS_EXIT_CRITICAL,
            NAGIOS_EXIT_UNKNOWN
        ][exit_code]

        reporter.cache(nagios_exit, message)

        (handle, output_filename) = tempfile.mkstemp()
        os.close(handle)

        try:
            old_stdout = sys.stdout
            buffer = StringIO.StringIO()
            sys.stdout = buffer
            reporter_test = NagiosReporter('test_cache', filename, threshold,
                                           self.nagios_user)
            reporter_test.report_and_exit()
        except SystemExit, err:
            line = buffer.getvalue().rstrip()
            sys.stdout = old_stdout
            buffer.close()
            self.assertTrue(err.code == nagios_exit[0])
            self.assertTrue(line == "%s %s" % (nagios_exit[1], message))
예제 #4
0
    def test_threshold(self, message="Hello"):
        """Test the threshold borking mechanism in the reporter."""
        message = message.rstrip()
        threshold = 1
        if message == '':
            return

        (handle, filename) = tempfile.mkstemp()
        os.unlink(filename)
        reporter = NagiosReporter('test_cache', filename, threshold, self.nagios_user)

        # redirect stdout
        old_stdout = sys.stdout
        buff = StringIO.StringIO()
        sys.stdout = buff

        nagios_exit = NAGIOS_EXIT_OK
        reporter.cache(nagios_exit, message)
        os.close(handle)

        try:
            reporter_test = NagiosReporter('test_cache', filename, threshold, self.nagios_user)
            reporter_test.report_and_exit()
        except SystemExit, err:
            pass
def main(args):
    """Main script."""

    options = {
        'nagios': ('print out nagion information', None, 'store_true', False, 'n'),
        'nagios_check_filename': ('filename of where the nagios check data is stored', str, 'store', NAGIOS_CHECK_FILENAME),
        'nagios_check_interval_threshold': ('threshold of nagios checks timing out', None, 'store', NAGIOS_CHECK_INTERVAL_THRESHOLD),
        'mail-report': ('mail a report to the hpc-admin list with job list for gracing or inactive users',
                        None, 'store_true', False),
        'ha': ('high-availability master IP address', None, 'store', None),
        'dry-run': ('do not make any updates whatsoever', None, 'store_true', False),
    }
    opts = simple_option(options)

    nagios_reporter = NagiosReporter(NAGIOS_HEADER, NAGIOS_CHECK_FILENAME, NAGIOS_CHECK_INTERVAL_THRESHOLD)

    if opts.options.nagios:
        nagios_reporter.report_and_exit()
        sys.exit(0)  # not reached

    if not proceed_on_ha_service(opts.options.ha):
        logger.warning("Not running on the target host in the HA setup. Stopping.")
        nagios_reporter(NAGIOS_EXIT_WARNING,
                        NagiosResult("Not running on the HA master."))
        sys.exit(NAGIOS_EXIT_WARNING)

    try:
        vsc_config = VscConfiguration()
        LdapQuery(vsc_config)

        grace_users = get_user_with_status('grace')
        inactive_users = get_user_with_status('inactive')

        pbs_query = PBSQuery()

        t = time.ctime()
        jobs = pbs_query.getjobs()  # we just get them all

        removed_queued = remove_queued_jobs(jobs, grace_users, inactive_users, opts.options.dry_run)
        removed_running = remove_running_jobs(jobs, inactive_users, opts.options.dry_run)

        if opts.options.mail_report and not opts.options.dry_run:
            if len(removed_queued) > 0 or len(removed_running) > 0:
                mail_report(t, removed_queued, removed_running)
    except Exception, err:
        logger.exception("Something went wrong: {err}".format(err=err))
        nagios_reporter.cache(NAGIOS_EXIT_CRITICAL,
                              NagiosResult("Script failed, check log file ({logfile})".format(logfile=PBS_CHECK_LOG_FILE)))
        sys.exit(NAGIOS_EXIT_CRITICAL)
예제 #6
0
    def test_world_readable(self):
        """Test world readable cache"""
        (handle, filename) = tempfile.mkstemp()
        os.unlink(filename)

        n = SimpleNagios(_cache=filename, _cache_user=self.nagios_user, _world_readable=True)
        n.ok("test")
        os.close(handle)

        try:
            reporter_test = NagiosReporter('test_cache', filename, -1, self.nagios_user)
            reporter_test.report_and_exit()
        except SystemExit:
            pass

        statres = os.stat(filename)

        self.assertTrue(statres.st_mode & stat.S_IROTH)
예제 #7
0
    def test_cache(self):
        """Test the caching"""
        (handle, filename) = tempfile.mkstemp()
        os.unlink(filename)

        n = SimpleNagios(_cache=filename, _cache_user=self.nagios_user)
        message = "mywarning"
        n.warning(message)
        os.close(handle)

        self.buffo.seek(0)
        self.buffo.truncate(0)

        try:
            reporter_test = NagiosReporter('test_cache', filename, -1, self.nagios_user)
            reporter_test.report_and_exit()
        except SystemExit, e:
            pass
예제 #8
0
    def test_cache(self):
        """Test the caching"""
        (handle, filename) = tempfile.mkstemp()
        os.unlink(filename)

        n = SimpleNagios(_cache=filename, _cache_user=self.nagios_user)
        message = "mywarning"
        n.warning(message)
        os.close(handle)

        self.buffo.seek(0)
        self.buffo.truncate(0)

        try:
            reporter_test = NagiosReporter('test_cache', filename, -1,
                                           self.nagios_user)
            reporter_test.report_and_exit()
        except SystemExit, e:
            pass
예제 #9
0
    def test_world_readable(self):
        """Test world readable cache"""
        (handle, filename) = tempfile.mkstemp()
        os.unlink(filename)

        n = SimpleNagios(_cache=filename,
                         _cache_user=self.nagios_user,
                         _world_readable=True)
        n.ok("test")
        os.close(handle)

        try:
            reporter_test = NagiosReporter('test_cache', filename, -1,
                                           self.nagios_user)
            reporter_test.report_and_exit()
        except SystemExit:
            pass

        statres = os.stat(filename)

        self.assertTrue(statres.st_mode & stat.S_IROTH)
예제 #10
0
    def test_cache(self):
        """Test the caching mechanism in the reporter."""
        length = random.randint(1, 30)
        exit_code = random.randint(0, 3)
        threshold = random.randint(0, 10)

        message = ''.join(random.choice(string.printable) for x in range(length))
        message = message.rstrip()

        (handle, filename) = tempfile.mkstemp()
        os.unlink(filename)
        os.close(handle)
        reporter = NagiosReporter('test_cache', filename, threshold, self.nagios_user)

        nagios_exit = [NAGIOS_EXIT_OK, NAGIOS_EXIT_WARNING, NAGIOS_EXIT_CRITICAL, NAGIOS_EXIT_UNKNOWN][exit_code]

        reporter.cache(nagios_exit, message)

        (handle, output_filename) = tempfile.mkstemp()
        os.close(handle)

        try:
            old_stdout = sys.stdout
            buffer = StringIO.StringIO()
            sys.stdout = buffer
            reporter_test = NagiosReporter('test_cache', filename, threshold, self.nagios_user)
            reporter_test.report_and_exit()
        except SystemExit, err:
            line = buffer.getvalue().rstrip()
            sys.stdout = old_stdout
            buffer.close()
            self.assertTrue(err.code == nagios_exit[0])
            self.assertTrue(line == "%s %s" % (nagios_exit[1], message))
예제 #11
0
    def test_threshold(self, message="Hello"):
        """Test the threshold borking mechanism in the reporter."""
        message = message.rstrip()
        threshold = 1
        if message == '':
            return

        (handle, filename) = tempfile.mkstemp()
        os.unlink(filename)
        reporter = NagiosReporter('test_cache', filename, threshold, self.nagios_user)

        # redirect stdout
        old_stdout = sys.stdout
        buff = StringIO.StringIO()
        sys.stdout = buff

        nagios_exit = NAGIOS_EXIT_OK
        reporter.cache(nagios_exit, message)
        os.close(handle)

        try:
            reporter_test = NagiosReporter('test_cache', filename, threshold, self.nagios_user)
            reporter_test.report_and_exit()
        except SystemExit, err:
            pass
예제 #12
0
    def test_cache(self, exit_code, message, threshold):
        """Test the caching mechanism in the reporter."""
        message = message.rstrip()
        if message == '':
            return

        (handle, filename) = tempfile.mkstemp()
        os.unlink(filename)
        reporter = NagiosReporter('test_cache', filename, threshold, self.nagios_user )

        nagios_exit = [NAGIOS_EXIT_OK, NAGIOS_EXIT_WARNING, NAGIOS_EXIT_CRITICAL, NAGIOS_EXIT_UNKNOWN][exit_code]

        reporter.cache(nagios_exit, message)

        (handle, output_filename) = tempfile.mkstemp()
        os.close(handle)

        try:
            old_stdout = sys.stdout
            buffer = StringIO.StringIO()
            sys.stdout = buffer
            reporter_test = NagiosReporter('test_cache', filename, threshold, self.nagios_user)
            reporter_test.report_and_exit()
        except SystemExit, err:
            line = buffer.getvalue().rstrip()
            sys.stdout = old_stdout
            buffer.close()
            self.assertTrue(err.code == nagios_exit[0])
            self.assertTrue(line == "%s %s" % (nagios_exit[1], message))
예제 #13
0
    def test_threshold(self, message="Hello"):
        """Test the threshold borking mechanism in the reporter."""
        message = message.rstrip()
        threshold = 1
        if message == '':
            return

        (handle, filename) = tempfile.mkstemp()
        os.unlink(filename)
        reporter = NagiosReporter('test_cache', filename, threshold,
                                  self.nagios_user)

        nagios_exit = NAGIOS_EXIT_OK
        reporter.cache(nagios_exit, message)
        os.close(handle)

        try:
            reporter_test = NagiosReporter('test_cache', filename, threshold,
                                           self.nagios_user)
            reporter_test.report_and_exit()
        except SystemExit, err:
            self.assertEqual(
                err.code, NAGIOS_EXIT_OK[0],
                "Exit with status when the cached data is recent")
예제 #14
0
    def test_threshold(self, message="Hello"):
        """Test the threshold borking mechanism in the reporter."""
        message = message.rstrip()
        threshold = 1
        if message == '':
            return

        (handle, filename) = tempfile.mkstemp()
        os.unlink(filename)
        reporter = NagiosReporter('test_cache', filename, threshold, self.nagios_user)

        nagios_exit = NAGIOS_EXIT_OK
        reporter.cache(nagios_exit, message)
        os.close(handle)

        try:
            reporter_test = NagiosReporter('test_cache', filename, threshold, self.nagios_user)
            reporter_test.report_and_exit()
        except SystemExit, err:
            self.assertEqual(err.code, NAGIOS_EXIT_OK[0],
                             "Exit with status when the cached data is recent")
예제 #15
0
        nagios_exit = NAGIOS_EXIT_OK
        reporter.cache(nagios_exit, message)
        os.close(handle)

        try:
            reporter_test = NagiosReporter('test_cache', filename, threshold, self.nagios_user)
            reporter_test.report_and_exit()
        except SystemExit, err:
            pass
        self.assertEqual(err.code, NAGIOS_EXIT_OK[0],
                         "Exit with status when the cached data is recent")
        # restore stdout
        buff.close()
        sys.stdout = old_stdout

        reporter = NagiosReporter('test_cache', filename, threshold, self.nagios_user)
        reporter.cache(nagios_exit, message)
        time.sleep(threshold + 1)
        # redirect stdout
        old_stdout = sys.stdout
        buff = StringIO.StringIO()
        sys.stdout = buff
        try:
            reporter_test = NagiosReporter('test_cache', filename, threshold, self.nagios_user)
            reporter_test.report_and_exit()
        except SystemExit, err:
            pass

        line = buff.getvalue().rstrip()
        # restore stdout
        buff.close()
예제 #16
0
def main():
    """Main script"""

    options = {
        'nagios': ('print out nagios information', None, 'store_true', False, 'n'),
        'nagios-check-filename': ('filename of where the nagios check data is stored', str, 'store', NAGIOS_CHECK_FILENAME),
        'nagios-check-interval-threshold': ('threshold of nagios checks timing out', None, 'store', NAGIOS_CHECK_INTERVAL_THRESHOLD),
        'storage': ('the VSC filesystems that are checked by this script', None, 'extend', []),
        'dry-run': ('do not make any updates whatsoever', None, 'store_true', False),
    }
    opts = simple_option(options)

    logger.info('started GPFS quota check run.')

    nagios_reporter = NagiosReporter(NAGIOS_HEADER,
                                     opts.options.nagios_check_filename,
                                     opts.options.nagios_check_interval_threshold)

    if opts.options.nagios:
        nagios_reporter.report_and_exit()
        sys.exit(0)  # not reached

    lockfile = TimestampedPidLockfile(QUOTA_CHECK_LOCK_FILE)
    lock_or_bork(lockfile, nagios_reporter)

    try:
        user_id_map = map_uids_to_names() # is this really necessary?
        LdapQuery(VscConfiguration())
        gpfs = GpfsOperations()
        filesystems = gpfs.list_filesystems().keys()
        logger.debug("Found the following GPFS filesystems: %s" % (filesystems))

        filesets = gpfs.list_filesets()
        logger.debug("Found the following GPFS filesets: %s" % (filesets))

        quota = gpfs.list_quota()

        for storage in opts.options.storage:

            logger.info("Processing quota for storage %s" % (storage))
            filesystem = opts.configfile_parser.get(storage, 'filesystem')

            if filesystem not in filesystems:
                logger.error("Non-existant filesystem %s" % (filesystem))
                continue

            if filesystem not in quota.keys():
                logger.error("No quota defined for storage %s [%s]" % (storage, filesystem))
                continue

            quota_storage_map = get_mmrepquota_maps(quota[filesystem], storage,filesystem, filesets)

            exceeding_filesets = process_fileset_quota(gpfs, storage, filesystem, quota_storage_map['FILESET'])
            exceeding_users = process_user_quota(gpfs, storage, filesystem, quota_storage_map['USR'], user_id_map)

            logger.warning("storage %s found %d filesets that are exceeding their quota: %s" % (storage,
                                                                                                len(exceeding_filesets),
                                                                                                exceeding_filesets))
            logger.warning("storage %s found %d users who are exceeding their quota: %s" % (storage,
                                                                                            len(exceeding_users),
                                                                                            exceeding_users))

            notify_exceeding_filesets(gpfs=gpfs,
                                      storage=storage,
                                      filesystem=filesystem,
                                      exceeding_items=exceeding_filesets,
                                      dry_run=opts.options.dry_run)
            notify_exceeding_users(gpfs=gpfs,
                                   storage=storage,
                                   filesystem=filesystem,
                                   exceeding_items=exceeding_users,
                                   dry_run=opts.options.dry_run)

        sys.exit(1)

    except Exception, err:
        logger.exception("critical exception caught: %s" % (err))
        if not opts.options.dry_run:
            nagios_reporter.cache(NAGIOS_EXIT_CRITICAL, NagiosResult("CRITICAL script failed - %s" % (err.message)))
        if not opts.options.dry_run:
            lockfile.release()
        sys.exit(1)
예제 #17
0
    def test_threshold(self, message="Hello"):
        """Test the threshold borking mechanism in the reporter."""
        message = message.rstrip()
        threshold = 1
        if message == '':
            return

        (handle, filename) = tempfile.mkstemp()
        os.unlink(filename)
        reporter = NagiosReporter('test_cache', filename, threshold,
                                  self.nagios_user)

        # redirect stdout
        old_stdout = sys.stdout
        buff = StringIO()
        sys.stdout = buff

        nagios_exit = NAGIOS_EXIT_OK
        reporter.cache(nagios_exit, message)
        os.close(handle)

        raised_exception = None
        try:
            reporter_test = NagiosReporter('test_cache', filename, threshold,
                                           self.nagios_user)
            reporter_test.report_and_exit()
        except SystemExit as err:
            raised_exception = err
        self.assertEqual(raised_exception.code, NAGIOS_EXIT_OK[0],
                         "Exit with status when the cached data is recent")
        # restore stdout
        buff.close()
        sys.stdout = old_stdout

        reporter = NagiosReporter('test_cache', filename, threshold,
                                  self.nagios_user)
        reporter.cache(nagios_exit, message)
        time.sleep(threshold + 1)
        # redirect stdout
        old_stdout = sys.stdout
        buff = StringIO()
        sys.stdout = buff

        raised_exception = None
        try:
            reporter_test = NagiosReporter('test_cache', filename, threshold,
                                           self.nagios_user)
            reporter_test.report_and_exit()
        except SystemExit as err:
            raised_exception = err

        line = buff.getvalue().rstrip()
        # restore stdout
        buff.close()
        sys.stdout = old_stdout
        self.assertEqual(raised_exception.code, NAGIOS_EXIT_UNKNOWN[0],
                         "Too old caches lead to unknown status")
        self.assertTrue(
            line.startswith(
                "%s test_cache gzipped JSON file too old (timestamp =" %
                (NAGIOS_EXIT_UNKNOWN[1])))

        os.unlink(filename)
예제 #18
0
def main():
    # Collect all info

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., ofr each cluster will be obtained from the configuration file
    options = {
        "nagios": ("print out nagios information", None, "store_true", False, "n"),
        "nagios_check_filename": (
            "filename of where the nagios check data is stored",
            str,
            "store",
            NAGIOS_CHECK_FILENAME,
        ),
        "nagios_check_interval_threshold": (
            "threshold of nagios checks timing out",
            None,
            "store",
            NAGIOS_CHECK_INTERVAL_THRESHOLD,
        ),
        "hosts": ("the hosts/clusters that should be contacted for job information", None, "extend", []),
        "location": ("the location for storing the pickle file: home, scratch", str, "store", "home"),
        "ha": ("high-availability master IP address", None, "store", None),
        "dry-run": ("do not make any updates whatsoever", None, "store_true", False),
    }

    opts = simple_option(options)

    if opts.options.debug:
        fancylogger.setLogLevelDebug()

    nagios_reporter = NagiosReporter(
        NAGIOS_HEADER, opts.options.nagios_check_filename, opts.options.nagios_check_interval_threshold
    )
    if opts.options.nagios:
        logger.debug("Producing Nagios report and exiting.")
        nagios_reporter.report_and_exit()
        sys.exit(0)  # not reached

    if not proceed_on_ha_service(opts.options.ha):
        logger.warning("Not running on the target host in the HA setup. Stopping.")
        nagios_reporter.cache(NAGIOS_EXIT_WARNING, NagiosResult("Not running on the HA master."))
        sys.exit(NAGIOS_EXIT_WARNING)

    lockfile = TimestampedPidLockfile(DCHECKJOB_LOCK_FILE)
    lock_or_bork(lockfile, nagios_reporter)

    logger.info("Starting dcheckjob")

    LdapQuery(VscConfiguration())

    clusters = {}
    for host in opts.options.hosts:
        master = opts.configfile_parser.get(host, "master")
        checkjob_path = opts.configfile_parser.get(host, "checkjob_path")
        clusters[host] = {"master": master, "path": checkjob_path}

    checkjob = Checkjob(clusters, cache_pickle=True, dry_run=True)

    (job_information, reported_hosts, failed_hosts) = checkjob.get_moab_command_information()
    timeinfo = time.time()

    active_users = job_information.keys()

    logger.debug("Active users: %s" % (active_users))
    logger.debug("Checkjob information: %s" % (job_information))

    nagios_user_count = 0
    nagios_no_store = 0

    for user in active_users:
        if not opts.options.dry_run:
            try:
                (path, store) = get_pickle_path(opts.options.location, user)
                user_queue_information = CheckjobInfo({user: job_information[user]})
                store(user, path, (timeinfo, user_queue_information))
                nagios_user_count += 1
            except (UserStorageError, FileStoreError, FileMoveError), _:
                logger.error("Could not store pickle file for user %s" % (user))
                nagios_no_store += 1
        else:
            logger.info(
                "Dry run, not actually storing data for user %s at path %s"
                % (user, get_pickle_path(opts.options.location, user)[0])
            )
            logger.debug("Dry run, queue information for user %s is %s" % (user, job_information[user]))
예제 #19
0
def main():
    # Collect all info

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., ofr each cluster will be obtained from the configuration file
    options = {
        "nagios": ("print out nagion information", None, "store_true", False, "n"),
        "nagios_check_filename": (
            "filename of where the nagios check data is stored",
            str,
            "store",
            NAGIOS_CHECK_FILENAME,
        ),
        "nagios_check_interval_threshold": (
            "threshold of nagios checks timing out",
            None,
            "store",
            NAGIOS_CHECK_INTERVAL_THRESHOLD,
        ),
        "hosts": ("the hosts/clusters that should be contacted for job information", None, "extend", []),
        "information": ("the sort of information to store: user, vo, project", None, "store", "user"),
        "location": ("the location for storing the pickle file: gengar, muk", str, "store", "gengar"),
        "ha": ("high-availability master IP address", None, "store", None),
        "dry-run": ("do not make any updates whatsoever", None, "store_true", False),
    }

    opts = simple_option(options)

    if opts.options.debug:
        fancylogger.setLogLevelDebug()

    nagios_reporter = NagiosReporter(NAGIOS_HEADER, NAGIOS_CHECK_FILENAME, NAGIOS_CHECK_INTERVAL_THRESHOLD)
    if opts.options.nagios:
        logger.debug("Producing Nagios report and exiting.")
        nagios_reporter.report_and_exit()
        sys.exit(0)  # not reached

    if not proceed_on_ha_service(opts.options.ha):
        logger.warning("Not running on the target host in the HA setup. Stopping.")
        nagios_reporter.cache(NAGIOS_EXIT_WARNING, NagiosResult("Not running on the HA master."))
        sys.exit(NAGIOS_EXIT_WARNING)

    lockfile = TimestampedPidLockfile(DSHOWQ_LOCK_FILE)
    lock_or_bork(lockfile, nagios_reporter)

    logger.info("starting dshowq run")

    clusters = {}
    for host in opts.options.hosts:
        master = opts.configfile_parser.get(host, "master")
        showq_path = opts.configfile_parser.get(host, "showq_path")
        clusters[host] = {"master": master, "path": showq_path}

    showq = Showq(clusters, cache_pickle=True, dry_run=opts.options.dry_run)

    (queue_information, reported_hosts, failed_hosts) = showq.get_moab_command_information()
    timeinfo = time.time()

    active_users = queue_information.keys()

    logger.debug("Active users: %s" % (active_users))
    logger.debug("Queue information: %s" % (queue_information))

    # We need to determine which users should get an updated pickle. This depends on
    # - the active user set
    # - the information we want to provide on the cluster(set) where this script runs
    # At the same time, we need to determine the job information each user gets to see
    (target_users, target_queue_information, user_map) = determine_target_information(
        opts.options.information, active_users, queue_information
    )

    nagios_user_count = 0
    nagios_no_store = 0

    LdapQuery(VscConfiguration())

    for user in target_users:
        if not opts.options.dry_run:
            try:
                (path, store) = get_pickle_path(opts.options.location, user)
                user_queue_information = target_queue_information[user]
                user_queue_information["timeinfo"] = timeinfo
                store(user, path, (user_queue_information, user_map[user]))
                nagios_user_count += 1
            except (UserStorageError, FileStoreError, FileMoveError), err:
                logger.error("Could not store pickle file for user %s" % (user))
                nagios_no_store += 1
        else:
            logger.info(
                "Dry run, not actually storing data for user %s at path %s"
                % (user, get_pickle_path(opts.options.location, user)[0])
            )
            logger.debug("Dry run, queue information for user %s is %s" % (user, target_queue_information[user]))
예제 #20
0
        nagios_exit = NAGIOS_EXIT_OK
        reporter.cache(nagios_exit, message)
        os.close(handle)

        try:
            reporter_test = NagiosReporter('test_cache', filename, threshold, self.nagios_user)
            reporter_test.report_and_exit()
        except SystemExit, err:
            pass
        self.assertEqual(err.code, NAGIOS_EXIT_OK[0],
                         "Exit with status when the cached data is recent")
        # restore stdout
        buff.close()
        sys.stdout = old_stdout

        reporter = NagiosReporter('test_cache', filename, threshold, self.nagios_user)
        reporter.cache(nagios_exit, message)
        time.sleep(threshold + 1)
        # redirect stdout
        old_stdout = sys.stdout
        buff = StringIO.StringIO()
        sys.stdout = buff
        try:
            reporter_test = NagiosReporter('test_cache', filename, threshold, self.nagios_user)
            reporter_test.report_and_exit()
        except SystemExit, err:
            pass

        line = buff.getvalue().rstrip()
        # restore stdout
        buff.close()
예제 #21
0
def main():
    """The main."""

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., ofr each cluster will be obtained from the configuration file
    options = {
        'nagios': ('print out nagios information', None, 'store_true', False, 'n'),
        'nagios-check-filename': ('filename of where the nagios check data is stored',
                                  str,
                                  'store',
                                  NAGIOS_CHECK_FILENAME),
        'nagios-check-interval-threshold': ('threshold of nagios checks timing out',
                                            None,
                                            'store',
                                            NAGIOS_CHECK_INTERVAL_THRESHOLD),
        'location': ('path to store the gzipped files', None, 'store', QUOTA_LOG_ZIP_PATH),
        'ha': ('high-availability master IP address', None, 'store', None),
        'dry-run': ('do not make any updates whatsoever', None, 'store_true', False),
    }

    opts = simple_option(options)

    nagios_reporter = NagiosReporter(NAGIOS_HEADER,
                                     opts.options.nagios_check_filename,
                                     opts.options.nagios_check_interval_threshold)
    if opts.options.nagios:
        logger.debug("Producing Nagios report and exiting.")
        nagios_reporter.report_and_exit()
        sys.exit(0)  # not reached

    if not proceed_on_ha_service(opts.options.ha):
        logger.warning("Not running on the target host in the HA setup. Stopping.")
        nagios_reporter.cache(NAGIOS_EXIT_WARNING,
                              NagiosResult("Not running on the HA master."))
        sys.exit(NAGIOS_EXIT_WARNING)

    lockfile = TimestampedPidLockfile(QUOTA_LOG_LOCK_FILE)
    lock_or_bork(lockfile, nagios_reporter)

    logger.info("starting quota_log run")

    filesystem_error = 0
    filesystem_ok = 0
    error = False

    try:
        gpfs = GpfsOperations()
        quota = gpfs.list_quota()

        for key in quota:
            try:
                filename = "gpfs_quota_%s_%s.gz" % (time.strftime("%Y%m%d-%H:%M"), key)
                path = os.path.join(opts.options.location, filename)
                zipfile = gzip.open(path, 'wb', 9)  # Compress to the max
                zipfile.write(json.dumps(quota[key]))
                zipfile.close()
                filesystem_ok += 1
                logger.info("Stored quota information for FS %s" % (key))
            except Exception, err:
                logger.exception("Failed storing quota information for FS %s" % (key))
                filesystem_error += 1
    except Exception, err:
        logger.exception("Failure obtaining GPFS quota")
        error = True