def test_cache(self):
    """Test the caching"""
    # Reserve a unique temp path for the cache file; the file itself is
    # (re)created by SimpleNagios, so drop it right away.
    (fd, cache_file) = tempfile.mkstemp()
    os.unlink(cache_file)
    simple = SimpleNagios(_cache=cache_file, _cache_user=self.nagios_user)
    warning_text = "mywarning"
    simple.warning(warning_text)
    os.close(fd)

    # Reset the capture buffer so only the reporter's output is collected.
    self.buffo.seek(0)
    self.buffo.truncate(0)

    caught = None
    try:
        NagiosReporter('test_cache', cache_file, -1, self.nagios_user).report_and_exit()
    except SystemExit as err:
        caught = err

    # The cached warning must be replayed verbatim with the WARNING exit code.
    output = self.buffo.getvalue().rstrip()
    self.assertEqual(output, "WARNING %s" % warning_text)
    self.assertEqual(caught.code, NAGIOS_EXIT_WARNING[0])

    # By default the cache file must not be world readable.
    file_stat = os.stat(cache_file)
    self.assertFalse(file_stat.st_mode & stat.S_IROTH)
def test_cache(self):
    """Test the caching mechanism in the reporter.

    A random message is cached with a random exit code; the reporter must
    print it back verbatim and exit with the matching status code.
    """
    length = random.randint(1, 30)
    exit_code = random.randint(0, 3)
    threshold = random.randint(0, 10)
    message = ''.join(random.choice(string.printable) for x in range(length))
    message = message.rstrip()
    if message == '':
        # Fix: an empty message cannot round-trip -- the trailing space in
        # the "%s %s" output would be stripped (same guard as the
        # parametrised variants of this test).
        return

    (handle, filename) = tempfile.mkstemp()
    os.unlink(filename)
    os.close(handle)
    reporter = NagiosReporter('test_cache', filename, threshold, self.nagios_user)

    nagios_exit = [NAGIOS_EXIT_OK, NAGIOS_EXIT_WARNING, NAGIOS_EXIT_CRITICAL, NAGIOS_EXIT_UNKNOWN][exit_code]
    reporter.cache(nagios_exit, message)

    (handle, output_filename) = tempfile.mkstemp()
    os.close(handle)

    # Capture stdout while a fresh reporter replays the cached result.
    old_stdout = sys.stdout
    captured = StringIO.StringIO()
    raised_exception = None
    try:
        sys.stdout = captured
        reporter_test = NagiosReporter('test_cache', filename, threshold, self.nagios_user)
        reporter_test.report_and_exit()
    except SystemExit as err:
        raised_exception = err
    finally:
        # Fix: the original restored stdout only on the exception path.
        sys.stdout = old_stdout
    line = captured.getvalue().rstrip()
    captured.close()

    # Fix: the original asserted inside the except block, so the test
    # passed vacuously when report_and_exit() did not raise SystemExit.
    self.assertTrue(raised_exception is not None, "report_and_exit() must raise SystemExit")
    self.assertEqual(raised_exception.code, nagios_exit[0])
    self.assertEqual(line, "%s %s" % (nagios_exit[1], message))
def test_cache(self, exit_code, message, threshold):
    """Test the caching mechanism in the reporter.

    @param exit_code: index (0-3) selecting the nagios exit tuple to cache
    @param message: message to cache; trailing whitespace is stripped
    @param threshold: cache validity threshold passed to the reporter
    """
    message = message.rstrip()
    if message == '':
        # An empty message cannot round-trip (the trailing space in the
        # "%s %s" output would be stripped); nothing to test.
        return

    (handle, filename) = tempfile.mkstemp()
    os.unlink(filename)
    os.close(handle)  # fix: the original leaked this file descriptor
    reporter = NagiosReporter('test_cache', filename, threshold, self.nagios_user)

    nagios_exit = [NAGIOS_EXIT_OK, NAGIOS_EXIT_WARNING, NAGIOS_EXIT_CRITICAL, NAGIOS_EXIT_UNKNOWN][exit_code]
    reporter.cache(nagios_exit, message)

    (handle, output_filename) = tempfile.mkstemp()
    os.close(handle)

    # Capture stdout while a fresh reporter replays the cached result.
    old_stdout = sys.stdout
    captured = StringIO.StringIO()
    raised_exception = None
    try:
        sys.stdout = captured
        reporter_test = NagiosReporter('test_cache', filename, threshold, self.nagios_user)
        reporter_test.report_and_exit()
    except SystemExit as err:
        raised_exception = err
    finally:
        # Fix: the original restored stdout only on the exception path.
        sys.stdout = old_stdout
    line = captured.getvalue().rstrip()
    captured.close()

    # Fix: the original asserted inside the except block, so the test
    # passed vacuously when report_and_exit() did not raise SystemExit.
    self.assertTrue(raised_exception is not None, "report_and_exit() must raise SystemExit")
    self.assertEqual(raised_exception.code, nagios_exit[0])
    self.assertEqual(line, "%s %s" % (nagios_exit[1], message))
def test_threshold(self, message="Hello"):
    """Test the threshold borking mechanism in the reporter.

    @param message: message to cache; trailing whitespace is stripped
    """
    message = message.rstrip()
    threshold = 1
    if message == '':
        return

    (handle, filename) = tempfile.mkstemp()
    os.unlink(filename)
    reporter = NagiosReporter('test_cache', filename, threshold, self.nagios_user)

    # redirect stdout so report_and_exit() output does not pollute the run
    old_stdout = sys.stdout
    buff = StringIO.StringIO()
    sys.stdout = buff

    nagios_exit = NAGIOS_EXIT_OK
    reporter.cache(nagios_exit, message)
    os.close(handle)

    raised_exception = None
    try:
        reporter_test = NagiosReporter('test_cache', filename, threshold, self.nagios_user)
        reporter_test.report_and_exit()
    except SystemExit as err:
        raised_exception = err
    finally:
        # Fix: the original never restored stdout nor closed the buffer.
        sys.stdout = old_stdout
        buff.close()

    # Fix: the original swallowed the SystemExit without checking anything.
    self.assertTrue(raised_exception is not None, "report_and_exit() must raise SystemExit")
    self.assertEqual(raised_exception.code, NAGIOS_EXIT_OK[0],
                     "Exit with status when the cached data is recent")
def main(args):
    """Main script.

    Removes PBS jobs of graced/inactive users, optionally mails a report,
    and caches a Nagios result on failure.
    """
    options = {
        # fix: 'nagion' typo in the help string
        'nagios': ('print out nagios information', None, 'store_true', False, 'n'),
        'nagios_check_filename': ('filename of where the nagios check data is stored', str, 'store', NAGIOS_CHECK_FILENAME),
        'nagios_check_interval_threshold': ('threshold of nagios checks timing out', None, 'store', NAGIOS_CHECK_INTERVAL_THRESHOLD),
        'mail-report': ('mail a report to the hpc-admin list with job list for gracing or inactive users', None, 'store_true', False),
        'ha': ('high-availability master IP address', None, 'store', None),
        'dry-run': ('do not make any updates whatsoever', None, 'store_true', False),
    }
    opts = simple_option(options)

    # Fix: honour the command-line overrides instead of ignoring the
    # declared options in favour of the module constants.
    nagios_reporter = NagiosReporter(NAGIOS_HEADER,
                                     opts.options.nagios_check_filename,
                                     opts.options.nagios_check_interval_threshold)
    if opts.options.nagios:
        nagios_reporter.report_and_exit()
        sys.exit(0)  # not reached

    if not proceed_on_ha_service(opts.options.ha):
        logger.warning("Not running on the target host in the HA setup. Stopping.")
        # Fix: the original called the reporter object itself
        # (nagios_reporter(...)); caching goes through .cache().
        nagios_reporter.cache(NAGIOS_EXIT_WARNING, NagiosResult("Not running on the HA master."))
        # NOTE(review): NAGIOS_EXIT_WARNING is used as a (code, text) tuple
        # elsewhere in this file; sys.exit() on a tuple exits with status 1
        # -- confirm whether NAGIOS_EXIT_WARNING[0] was intended.
        sys.exit(NAGIOS_EXIT_WARNING)

    try:
        vsc_config = VscConfiguration()
        LdapQuery(vsc_config)

        grace_users = get_user_with_status('grace')
        inactive_users = get_user_with_status('inactive')

        pbs_query = PBSQuery()

        t = time.ctime()
        jobs = pbs_query.getjobs()  # we just get them all

        removed_queued = remove_queued_jobs(jobs, grace_users, inactive_users, opts.options.dry_run)
        removed_running = remove_running_jobs(jobs, inactive_users, opts.options.dry_run)

        if opts.options.mail_report and not opts.options.dry_run:
            if len(removed_queued) > 0 or len(removed_running) > 0:
                mail_report(t, removed_queued, removed_running)
    except Exception as err:
        logger.exception("Something went wrong: {err}".format(err=err))
        nagios_reporter.cache(NAGIOS_EXIT_CRITICAL,
                              NagiosResult("Script failed, check log file ({logfile})".format(logfile=PBS_CHECK_LOG_FILE)))
        sys.exit(NAGIOS_EXIT_CRITICAL)
def test_world_readable(self):
    """Test world readable cache"""
    # Reserve a unique temp path; SimpleNagios recreates the cache file.
    (fd, cache_file) = tempfile.mkstemp()
    os.unlink(cache_file)

    # Write an OK result through SimpleNagios with world-readable caching on.
    simple = SimpleNagios(_cache=cache_file, _cache_user=self.nagios_user, _world_readable=True)
    simple.ok("test")
    os.close(fd)

    try:
        NagiosReporter('test_cache', cache_file, -1, self.nagios_user).report_and_exit()
    except SystemExit:
        # report_and_exit() always exits; the status is irrelevant here.
        pass

    # The cache file must carry the world-readable permission bit.
    mode = os.stat(cache_file).st_mode
    self.assertTrue(mode & stat.S_IROTH)
def test_cache(self):
    """Test the caching"""
    (handle, filename) = tempfile.mkstemp()
    os.unlink(filename)
    n = SimpleNagios(_cache=filename, _cache_user=self.nagios_user)
    message = "mywarning"
    n.warning(message)
    os.close(handle)

    # Reset the capture buffer so only the reporter's output is collected.
    self.buffo.seek(0)
    self.buffo.truncate(0)

    raised_exception = None
    try:
        reporter_test = NagiosReporter('test_cache', filename, -1, self.nagios_user)
        reporter_test.report_and_exit()
    except SystemExit as err:
        raised_exception = err

    # Fix: the original swallowed the SystemExit without asserting anything,
    # so the test verified nothing. Check that the cached warning is
    # replayed with the WARNING exit code.
    self.assertTrue(raised_exception is not None, "report_and_exit() must raise SystemExit")
    self.assertEqual(self.buffo.getvalue().rstrip(), "WARNING %s" % message)
    self.assertEqual(raised_exception.code, NAGIOS_EXIT_WARNING[0])
def test_cache(self, exit_code, message, threshold):
    """Test the caching mechanism in the reporter.

    @param exit_code: index (0-3) selecting the nagios exit tuple to cache
    @param message: message to cache; trailing whitespace is stripped
    @param threshold: cache validity threshold passed to the reporter
    """
    message = message.rstrip()
    if message == '':
        # An empty message cannot round-trip (the trailing space in the
        # "%s %s" output would be stripped); nothing to test.
        return

    (handle, filename) = tempfile.mkstemp()
    os.unlink(filename)
    os.close(handle)  # fix: the original leaked this file descriptor
    reporter = NagiosReporter('test_cache', filename, threshold, self.nagios_user)

    nagios_exit = [NAGIOS_EXIT_OK, NAGIOS_EXIT_WARNING, NAGIOS_EXIT_CRITICAL, NAGIOS_EXIT_UNKNOWN][exit_code]
    reporter.cache(nagios_exit, message)

    (handle, output_filename) = tempfile.mkstemp()
    os.close(handle)

    # Capture stdout while a fresh reporter replays the cached result.
    old_stdout = sys.stdout
    captured = StringIO.StringIO()
    raised_exception = None
    try:
        sys.stdout = captured
        reporter_test = NagiosReporter('test_cache', filename, threshold, self.nagios_user)
        reporter_test.report_and_exit()
    except SystemExit as err:
        raised_exception = err
    finally:
        # Fix: the original restored stdout only on the exception path.
        sys.stdout = old_stdout
    line = captured.getvalue().rstrip()
    captured.close()

    # Fix: the original asserted inside the except block, so the test
    # passed vacuously when report_and_exit() did not raise SystemExit.
    self.assertTrue(raised_exception is not None, "report_and_exit() must raise SystemExit")
    self.assertEqual(raised_exception.code, nagios_exit[0])
    self.assertEqual(line, "%s %s" % (nagios_exit[1], message))
def test_threshold(self, message="Hello"):
    """Test the threshold borking mechanism in the reporter.

    @param message: message to cache; trailing whitespace is stripped
    """
    message = message.rstrip()
    threshold = 1
    if message == '':
        return

    (handle, filename) = tempfile.mkstemp()
    os.unlink(filename)
    reporter = NagiosReporter('test_cache', filename, threshold, self.nagios_user)

    # Cache an OK result; it is younger than the threshold, so the
    # reporter must replay it and exit with the OK status code.
    nagios_exit = NAGIOS_EXIT_OK
    reporter.cache(nagios_exit, message)
    os.close(handle)

    raised_exception = None
    try:
        reporter_test = NagiosReporter('test_cache', filename, threshold, self.nagios_user)
        reporter_test.report_and_exit()
    except SystemExit as err:
        raised_exception = err

    # Fix: the original asserted inside the except block, so the test
    # silently passed when no SystemExit was raised at all.
    self.assertTrue(raised_exception is not None, "report_and_exit() must raise SystemExit")
    self.assertEqual(raised_exception.code, NAGIOS_EXIT_OK[0],
                     "Exit with status when the cached data is recent")
def test_threshold(self, message="Hello"):
    """Test the threshold borking mechanism in the reporter.

    @param message: message to cache; trailing whitespace is stripped
    """
    message = message.rstrip()
    threshold = 1
    if message == '':
        return

    (handle, filename) = tempfile.mkstemp()
    os.unlink(filename)
    reporter = NagiosReporter('test_cache', filename, threshold, self.nagios_user)

    # Cache an OK result; it is younger than the threshold, so the
    # reporter must replay it and exit with the OK status code.
    nagios_exit = NAGIOS_EXIT_OK
    reporter.cache(nagios_exit, message)
    os.close(handle)

    raised_exception = None
    try:
        reporter_test = NagiosReporter('test_cache', filename, threshold, self.nagios_user)
        reporter_test.report_and_exit()
    except SystemExit as err:
        raised_exception = err

    # Fix: the original asserted inside the except block, so the test
    # silently passed when no SystemExit was raised at all.
    self.assertTrue(raised_exception is not None, "report_and_exit() must raise SystemExit")
    self.assertEqual(raised_exception.code, NAGIOS_EXIT_OK[0],
                     "Exit with status when the cached data is recent")
# NOTE(review): fragment of a threshold test -- the enclosing function's
# header and tail are outside this view; `reporter`, `message`, `threshold`,
# `handle`, `buff` and `old_stdout` are presumably bound earlier in it.
# First pass: a freshly cached OK result must be reported as recent.
nagios_exit = NAGIOS_EXIT_OK
reporter.cache(nagios_exit, message)
os.close(handle)
try:
    reporter_test = NagiosReporter('test_cache', filename, threshold, self.nagios_user)
    reporter_test.report_and_exit()
except SystemExit, err:
    pass
self.assertEqual(err.code, NAGIOS_EXIT_OK[0], "Exit with status when the cached data is recent")
# restore stdout
buff.close()
sys.stdout = old_stdout

# Second pass: re-cache, then sleep past the threshold so the cached data
# becomes stale before the reporter reads it back.
reporter = NagiosReporter('test_cache', filename, threshold, self.nagios_user)
reporter.cache(nagios_exit, message)
time.sleep(threshold + 1)
# redirect stdout
old_stdout = sys.stdout
buff = StringIO.StringIO()
sys.stdout = buff
try:
    reporter_test = NagiosReporter('test_cache', filename, threshold, self.nagios_user)
    reporter_test.report_and_exit()
except SystemExit, err:
    pass
# Captured reporter output for the stale cache; presumably asserted on
# after this excerpt -- confirm against the function's tail.
line = buff.getvalue().rstrip()
# restore stdout
buff.close()
def main():
    """Main script

    Runs the GPFS quota check for the configured storage backends and
    notifies filesets/users that exceed their quota.
    """
    options = {
        'nagios': ('print out nagios information', None, 'store_true', False, 'n'),
        'nagios-check-filename': ('filename of where the nagios check data is stored', str, 'store', NAGIOS_CHECK_FILENAME),
        'nagios-check-interval-threshold': ('threshold of nagios checks timing out', None, 'store', NAGIOS_CHECK_INTERVAL_THRESHOLD),
        'storage': ('the VSC filesystems that are checked by this script', None, 'extend', []),
        'dry-run': ('do not make any updates whatsoever', None, 'store_true', False),
    }
    opts = simple_option(options)
    logger.info('started GPFS quota check run.')

    nagios_reporter = NagiosReporter(NAGIOS_HEADER,
                                     opts.options.nagios_check_filename,
                                     opts.options.nagios_check_interval_threshold)

    if opts.options.nagios:
        nagios_reporter.report_and_exit()
        sys.exit(0)  # not reached

    lockfile = TimestampedPidLockfile(QUOTA_CHECK_LOCK_FILE)
    lock_or_bork(lockfile, nagios_reporter)

    try:
        user_id_map = map_uids_to_names()  # is this really necessary?
        LdapQuery(VscConfiguration())
        gpfs = GpfsOperations()

        filesystems = gpfs.list_filesystems().keys()
        logger.debug("Found the following GPFS filesystems: %s" % (filesystems))

        filesets = gpfs.list_filesets()
        logger.debug("Found the following GPFS filesets: %s" % (filesets))

        quota = gpfs.list_quota()

        for storage in opts.options.storage:
            logger.info("Processing quota for storage %s" % (storage))
            filesystem = opts.configfile_parser.get(storage, 'filesystem')

            if filesystem not in filesystems:
                logger.error("Non-existant filesystem %s" % (filesystem))
                continue

            if filesystem not in quota.keys():
                logger.error("No quota defined for storage %s [%s]" % (storage, filesystem))
                continue

            quota_storage_map = get_mmrepquota_maps(quota[filesystem], storage, filesystem, filesets)

            exceeding_filesets = process_fileset_quota(gpfs, storage, filesystem, quota_storage_map['FILESET'])
            exceeding_users = process_user_quota(gpfs, storage, filesystem, quota_storage_map['USR'], user_id_map)

            logger.warning("storage %s found %d filesets that are exceeding their quota: %s"
                           % (storage, len(exceeding_filesets), exceeding_filesets))
            logger.warning("storage %s found %d users who are exceeding their quota: %s"
                           % (storage, len(exceeding_users), exceeding_users))

            notify_exceeding_filesets(gpfs=gpfs, storage=storage, filesystem=filesystem,
                                      exceeding_items=exceeding_filesets, dry_run=opts.options.dry_run)
            notify_exceeding_users(gpfs=gpfs, storage=storage, filesystem=filesystem,
                                   exceeding_items=exceeding_users, dry_run=opts.options.dry_run)
    except Exception as err:
        logger.exception("critical exception caught: %s" % (err))
        if not opts.options.dry_run:
            # NOTE(review): err.message is Python-2-only and deprecated;
            # on a dry run neither the cache is written nor the lock
            # released -- confirm the dry-run guard on release is intended.
            nagios_reporter.cache(NAGIOS_EXIT_CRITICAL,
                                  NagiosResult("CRITICAL script failed - %s" % (err.message)))
            lockfile.release()
        sys.exit(1)

    # Fix: the original ended the try block with sys.exit(1) and never
    # released the lockfile on success; release it and exit cleanly.
    lockfile.release()
    sys.exit(0)
def test_threshold(self, message="Hello"):
    """Test the threshold borking mechanism in the reporter.

    A result cached within the threshold is replayed as-is; once the cache
    is older than the threshold the reporter reports UNKNOWN.
    """
    message = message.rstrip()
    threshold = 1
    if not message:
        return

    (fd, cache_file) = tempfile.mkstemp()
    os.unlink(cache_file)
    reporter = NagiosReporter('test_cache', cache_file, threshold, self.nagios_user)

    # First pass: cache an OK result and read it back immediately.
    saved_stdout = sys.stdout
    capture = StringIO()
    sys.stdout = capture

    nagios_exit = NAGIOS_EXIT_OK
    reporter.cache(nagios_exit, message)
    os.close(fd)

    caught = None
    try:
        NagiosReporter('test_cache', cache_file, threshold, self.nagios_user).report_and_exit()
    except SystemExit as err:
        caught = err

    self.assertEqual(caught.code, NAGIOS_EXIT_OK[0],
                     "Exit with status when the cached data is recent")

    capture.close()
    sys.stdout = saved_stdout

    # Second pass: re-cache, then wait until the data is older than the
    # threshold; the reporter must flag the stale cache as UNKNOWN.
    reporter = NagiosReporter('test_cache', cache_file, threshold, self.nagios_user)
    reporter.cache(nagios_exit, message)
    time.sleep(threshold + 1)

    saved_stdout = sys.stdout
    capture = StringIO()
    sys.stdout = capture

    caught = None
    try:
        NagiosReporter('test_cache', cache_file, threshold, self.nagios_user).report_and_exit()
    except SystemExit as err:
        caught = err

    stale_line = capture.getvalue().rstrip()
    capture.close()
    sys.stdout = saved_stdout

    self.assertEqual(caught.code, NAGIOS_EXIT_UNKNOWN[0], "Too old caches lead to unknown status")
    self.assertTrue(stale_line.startswith(
        "%s test_cache gzipped JSON file too old (timestamp =" % (NAGIOS_EXIT_UNKNOWN[1])))
    os.unlink(cache_file)
def main():
    """Main script.

    Collects checkjob information from the configured hosts/clusters and
    stores a pickle per active user.
    """
    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster will be obtained from the configuration file
    options = {
        "nagios": ("print out nagios information", None, "store_true", False, "n"),
        "nagios_check_filename": (
            "filename of where the nagios check data is stored",
            str,
            "store",
            NAGIOS_CHECK_FILENAME,
        ),
        "nagios_check_interval_threshold": (
            "threshold of nagios checks timing out",
            None,
            "store",
            NAGIOS_CHECK_INTERVAL_THRESHOLD,
        ),
        "hosts": ("the hosts/clusters that should be contacted for job information", None, "extend", []),
        "location": ("the location for storing the pickle file: home, scratch", str, "store", "home"),
        "ha": ("high-availability master IP address", None, "store", None),
        "dry-run": ("do not make any updates whatsoever", None, "store_true", False),
    }

    opts = simple_option(options)

    if opts.options.debug:
        fancylogger.setLogLevelDebug()

    nagios_reporter = NagiosReporter(
        NAGIOS_HEADER, opts.options.nagios_check_filename, opts.options.nagios_check_interval_threshold
    )
    if opts.options.nagios:
        logger.debug("Producing Nagios report and exiting.")
        nagios_reporter.report_and_exit()
        sys.exit(0)  # not reached

    if not proceed_on_ha_service(opts.options.ha):
        logger.warning("Not running on the target host in the HA setup. Stopping.")
        nagios_reporter.cache(NAGIOS_EXIT_WARNING, NagiosResult("Not running on the HA master."))
        # NOTE(review): NAGIOS_EXIT_WARNING is used as a (code, text) tuple
        # elsewhere; sys.exit() on a tuple exits with status 1 -- confirm
        # whether NAGIOS_EXIT_WARNING[0] was intended.
        sys.exit(NAGIOS_EXIT_WARNING)

    lockfile = TimestampedPidLockfile(DCHECKJOB_LOCK_FILE)
    lock_or_bork(lockfile, nagios_reporter)

    logger.info("Starting dcheckjob")

    LdapQuery(VscConfiguration())

    clusters = {}
    for host in opts.options.hosts:
        master = opts.configfile_parser.get(host, "master")
        checkjob_path = opts.configfile_parser.get(host, "checkjob_path")
        clusters[host] = {"master": master, "path": checkjob_path}

    # Fix: honour the --dry-run option; the original hard-coded
    # dry_run=True (compare with the Showq-based sibling script).
    checkjob = Checkjob(clusters, cache_pickle=True, dry_run=opts.options.dry_run)

    (job_information, reported_hosts, failed_hosts) = checkjob.get_moab_command_information()
    timeinfo = time.time()

    active_users = job_information.keys()

    logger.debug("Active users: %s" % (active_users))
    logger.debug("Checkjob information: %s" % (job_information))

    nagios_user_count = 0
    nagios_no_store = 0

    for user in active_users:
        if not opts.options.dry_run:
            try:
                (path, store) = get_pickle_path(opts.options.location, user)
                user_queue_information = CheckjobInfo({user: job_information[user]})
                store(user, path, (timeinfo, user_queue_information))
                nagios_user_count += 1
            except (UserStorageError, FileStoreError, FileMoveError):
                # A single user failure should not abort the whole run;
                # log it and bump the failure counter.
                logger.error("Could not store pickle file for user %s" % (user))
                nagios_no_store += 1
        else:
            logger.info(
                "Dry run, not actually storing data for user %s at path %s"
                % (user, get_pickle_path(opts.options.location, user)[0])
            )
            logger.debug("Dry run, queue information for user %s is %s" % (user, job_information[user]))
def main():
    """Main script.

    Gathers showq information from the configured hosts and stores the
    per-user/vo/project queue information as pickle files.
    """
    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster will be obtained from the configuration file
    options = {
        # fix: 'nagion' typo in the help string
        "nagios": ("print out nagios information", None, "store_true", False, "n"),
        "nagios_check_filename": (
            "filename of where the nagios check data is stored",
            str,
            "store",
            NAGIOS_CHECK_FILENAME,
        ),
        "nagios_check_interval_threshold": (
            "threshold of nagios checks timing out",
            None,
            "store",
            NAGIOS_CHECK_INTERVAL_THRESHOLD,
        ),
        "hosts": ("the hosts/clusters that should be contacted for job information", None, "extend", []),
        "information": ("the sort of information to store: user, vo, project", None, "store", "user"),
        "location": ("the location for storing the pickle file: gengar, muk", str, "store", "gengar"),
        "ha": ("high-availability master IP address", None, "store", None),
        "dry-run": ("do not make any updates whatsoever", None, "store_true", False),
    }

    opts = simple_option(options)

    if opts.options.debug:
        fancylogger.setLogLevelDebug()

    # Fix: honour the command-line overrides instead of ignoring the
    # declared options in favour of the module constants.
    nagios_reporter = NagiosReporter(
        NAGIOS_HEADER, opts.options.nagios_check_filename, opts.options.nagios_check_interval_threshold
    )
    if opts.options.nagios:
        logger.debug("Producing Nagios report and exiting.")
        nagios_reporter.report_and_exit()
        sys.exit(0)  # not reached

    if not proceed_on_ha_service(opts.options.ha):
        logger.warning("Not running on the target host in the HA setup. Stopping.")
        nagios_reporter.cache(NAGIOS_EXIT_WARNING, NagiosResult("Not running on the HA master."))
        # NOTE(review): NAGIOS_EXIT_WARNING is used as a (code, text) tuple
        # elsewhere; sys.exit() on a tuple exits with status 1 -- confirm
        # whether NAGIOS_EXIT_WARNING[0] was intended.
        sys.exit(NAGIOS_EXIT_WARNING)

    lockfile = TimestampedPidLockfile(DSHOWQ_LOCK_FILE)
    lock_or_bork(lockfile, nagios_reporter)

    logger.info("starting dshowq run")

    clusters = {}
    for host in opts.options.hosts:
        master = opts.configfile_parser.get(host, "master")
        showq_path = opts.configfile_parser.get(host, "showq_path")
        clusters[host] = {"master": master, "path": showq_path}

    showq = Showq(clusters, cache_pickle=True, dry_run=opts.options.dry_run)

    (queue_information, reported_hosts, failed_hosts) = showq.get_moab_command_information()
    timeinfo = time.time()

    active_users = queue_information.keys()

    logger.debug("Active users: %s" % (active_users))
    logger.debug("Queue information: %s" % (queue_information))

    # We need to determine which users should get an updated pickle. This depends on
    # - the active user set
    # - the information we want to provide on the cluster(set) where this script runs
    # At the same time, we need to determine the job information each user gets to see
    (target_users, target_queue_information, user_map) = determine_target_information(
        opts.options.information, active_users, queue_information
    )

    nagios_user_count = 0
    nagios_no_store = 0

    LdapQuery(VscConfiguration())

    for user in target_users:
        if not opts.options.dry_run:
            try:
                (path, store) = get_pickle_path(opts.options.location, user)
                user_queue_information = target_queue_information[user]
                user_queue_information["timeinfo"] = timeinfo
                store(user, path, (user_queue_information, user_map[user]))
                nagios_user_count += 1
            except (UserStorageError, FileStoreError, FileMoveError) as err:
                # A single user failure should not abort the whole run;
                # log it and bump the failure counter.
                logger.error("Could not store pickle file for user %s" % (user))
                nagios_no_store += 1
        else:
            logger.info(
                "Dry run, not actually storing data for user %s at path %s"
                % (user, get_pickle_path(opts.options.location, user)[0])
            )
            logger.debug("Dry run, queue information for user %s is %s" % (user, target_queue_information[user]))
def main():
    """The main."""
    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster will be obtained from the configuration file
    options = {
        'nagios': ('print out nagios information', None, 'store_true', False, 'n'),
        'nagios-check-filename': ('filename of where the nagios check data is stored', str, 'store', NAGIOS_CHECK_FILENAME),
        'nagios-check-interval-threshold': ('threshold of nagios checks timing out', None, 'store', NAGIOS_CHECK_INTERVAL_THRESHOLD),
        'location': ('path to store the gzipped files', None, 'store', QUOTA_LOG_ZIP_PATH),
        'ha': ('high-availability master IP address', None, 'store', None),
        'dry-run': ('do not make any updates whatsoever', None, 'store_true', False),
    }
    opts = simple_option(options)

    nagios_reporter = NagiosReporter(NAGIOS_HEADER,
                                     opts.options.nagios_check_filename,
                                     opts.options.nagios_check_interval_threshold)
    if opts.options.nagios:
        logger.debug("Producing Nagios report and exiting.")
        nagios_reporter.report_and_exit()
        sys.exit(0)  # not reached

    # Bail out early when this host is not the HA master.
    if not proceed_on_ha_service(opts.options.ha):
        logger.warning("Not running on the target host in the HA setup. Stopping.")
        nagios_reporter.cache(NAGIOS_EXIT_WARNING, NagiosResult("Not running on the HA master."))
        sys.exit(NAGIOS_EXIT_WARNING)

    lockfile = TimestampedPidLockfile(QUOTA_LOG_LOCK_FILE)
    lock_or_bork(lockfile, nagios_reporter)

    logger.info("starting quota_log run")

    # Per-filesystem success/failure counters for the Nagios result.
    filesystem_error = 0
    filesystem_ok = 0
    # Overall failure flag; presumably consumed after this excerpt -- the
    # function appears to continue beyond the visible source.
    error = False

    try:
        gpfs = GpfsOperations()
        quota = gpfs.list_quota()

        for key in quota:
            try:
                # One timestamped gzipped JSON dump per filesystem.
                filename = "gpfs_quota_%s_%s.gz" % (time.strftime("%Y%m%d-%H:%M"), key)
                path = os.path.join(opts.options.location, filename)
                zipfile = gzip.open(path, 'wb', 9)  # Compress to the max
                zipfile.write(json.dumps(quota[key]))
                zipfile.close()

                filesystem_ok += 1
                logger.info("Stored quota information for FS %s" % (key))
            except Exception, err:
                # A single filesystem failure is counted, not fatal.
                logger.exception("Failed storing quota information for FS %s" % (key))
                filesystem_error += 1
    except Exception, err:
        logger.exception("Failure obtaining GPFS quota")
        error = True