def main():
    """Dump GPFS fileset (inode) information to gzipped JSON logs.

    For every GPFS filesystem the fileset information is stored as a gzipped
    JSON file under --location, the inode usage is processed, and the admins
    are mailed when filesets approach their limit. Nagios/Icinga statistics
    are reported through the epilogue.
    """
    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'location': ('path to store the gzipped files', None, 'store', INODE_LOG_ZIP_PATH),
    }
    opts = ExtendedSimpleOption(options)
    stats = {}

    try:
        gpfs = GpfsOperations()
        filesets = gpfs.list_filesets()

        if not os.path.exists(opts.options.location):
            # 0755 (Python 2 octal literal) is a syntax error on Python 3; use 0o755
            os.makedirs(opts.options.location, 0o755)

        critical_filesets = dict()

        for filesystem in filesets:
            stats["%s_inodes_log_critical" % (filesystem,)] = INODE_STORE_LOG_CRITICAL
            try:
                filename = "gpfs_inodes_%s_%s.gz" % (time.strftime("%Y%m%d-%H:%M"), filesystem)
                path = os.path.join(opts.options.location, filename)
                # context manager closes the file even when the write fails;
                # gzip streams opened 'wb' require bytes, hence the encode()
                with gzip.open(path, 'wb', 9) as zipfile:  # Compress to the max
                    zipfile.write(json.dumps(filesets[filesystem]).encode())
                stats["%s_inodes_log" % (filesystem,)] = 0
                logger.info("Stored inodes information for FS %s", filesystem)

                cfs = process_inodes_information(filesets[filesystem])
                logger.info("Processed inodes information for filesystem %s", filesystem)
                if cfs:
                    critical_filesets[filesystem] = cfs
                    logger.info("Filesystem %s has at least %d filesets reaching the limit", filesystem, len(cfs))
            except Exception:
                # one filesystem failing must not stop the others; record it in the stats
                stats["%s_inodes_log" % (filesystem,)] = 1
                logger.exception("Failed storing inodes information for FS %s", filesystem)

        logger.info("Critical filesets: %s", critical_filesets)
        if critical_filesets:
            mail_admins(critical_filesets, opts.options.dry_run)

    except Exception:
        logger.exception("Failure obtaining GPFS inodes")
        opts.critical("Failure to obtain GPFS inodes information")
        sys.exit(NAGIOS_EXIT_CRITICAL)

    opts.epilogue("Logged GPFS inodes", stats)
def main():
    """Dump quota information from the selected storage backend to gzipped JSON logs.

    Each filesystem's quota report is stored as a gzipped JSON file under
    --location; nagios statistics are reported through the epilogue.
    """
    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'location': ('path to store the gzipped files', None, 'store', QUOTA_LOG_ZIP_PATH),
        'backend': ('Storage backend', None, 'store', 'gpfs'),
    }
    opts = ExtendedSimpleOption(options)
    stats = {}
    backend = opts.options.backend

    try:
        if backend == 'gpfs':
            storage_backend = GpfsOperations()
        elif backend == 'lustre':
            storage_backend = LustreOperations()
        else:
            # fail explicitly instead of falling through to a NameError on
            # storage_backend; the outer except turns this into a critical state.
            # (logger.exception is only valid inside an except handler, so log an error)
            logger.error("Backend %s not supported", backend)
            raise ValueError("Backend %s not supported" % backend)

        quota = storage_backend.list_quota()

        if not os.path.exists(opts.options.location):
            os.makedirs(opts.options.location, 0o755)

        for key in quota:
            stats["%s_quota_log_critical" % (key,)] = QUOTA_STORE_LOG_CRITICAL
            try:
                filename = "%s_quota_%s_%s.gz" % (backend, time.strftime("%Y%m%d-%H:%M"), key)
                path = os.path.join(opts.options.location, filename)
                # context manager guarantees the file is closed even on write errors
                with gzip.open(path, 'wb', 9) as zipfile:  # Compress to the max
                    zipfile.write(json.dumps(quota[key]).encode())
                stats["%s_quota_log" % (key,)] = 0
                logger.info("Stored quota information for FS %s", key)
            except Exception:
                # one filesystem failing must not stop the others
                stats["%s_quota_log" % (key,)] = 1
                logger.exception("Failed storing quota information for FS %s", key)
    except Exception:
        logger.exception("Failure obtaining %s quota", backend)
        opts.critical("Failure to obtain %s quota information" % backend)

    opts.epilogue("Logged %s quota" % backend, stats)
def main():
    """
    Set the options and initiates the main run.

    Optionally (re)creates the Lmod cache, converts it to JSON, and then
    checks the cache freshness. Returns the errors if any in a nagios/icinga
    friendly way.
    """
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'create-cache': ('Create the Lmod cache', None, 'store_true', False),
        'freshness-threshold': ('The interval in minutes for how long we consider the cache to be fresh',
                                'int', 'store', 120),
    }
    opts = ExtendedSimpleOption(options)

    try:
        if opts.options.create_cache:
            opts.log.info("Updating the Lmod cache")
            exitcode, msg = run_cache_create()
            if exitcode != 0:
                opts.log.error("Lmod cache update failed: %s", msg)
                opts.critical("Lmod cache update failed")
            try:
                convert_lmod_cache_to_json()
            except Exception as err:
                opts.log.exception("Lmod to JSON failed: %s", err)
                opts.critical("Lmod to JSON failed.")

        opts.log.info("Checking the Lmod cache freshness")
        timestamp = os.stat(get_lmod_conf()['timestamp'])
        # give a warning when the cache is older than --freshness-threshold
        if (time.time() - timestamp.st_mtime) > opts.options.freshness_threshold * 60:
            errmsg = "Lmod cache is not fresh"
            # Logger.warn is a deprecated alias of Logger.warning
            opts.log.warning(errmsg)
            opts.warning(errmsg)

    except RuntimeError as err:
        opts.log.exception("Failed to update Lmod cache: %s", err)
        opts.critical("Failed to update Lmod cache. See logs.")
    except Exception as err:  # pylint: disable=W0703
        opts.log.exception("critical exception caught: %s", err)
        opts.critical("Script failed because of uncaught exception. See logs.")

    if opts.options.create_cache:
        opts.epilogue("Lmod cache updated.")
    else:
        opts.epilogue("Lmod cache is still fresh.")
def main():
    """Dump GPFS quota information to gzipped JSON logs.

    Each filesystem's quota report is stored as a gzipped JSON file under
    --location; nagios statistics are reported through the epilogue.
    """
    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'location': ('path to store the gzipped files', None, 'store', QUOTA_LOG_ZIP_PATH),
    }
    opts = ExtendedSimpleOption(options)
    stats = {}

    try:
        gpfs = GpfsOperations()
        quota = gpfs.list_quota()

        if not os.path.exists(opts.options.location):
            # 0755 (Python 2 octal literal) is a syntax error on Python 3; use 0o755
            os.makedirs(opts.options.location, 0o755)

        for key in quota:
            stats["%s_quota_log_critical" % (key,)] = QUOTA_STORE_LOG_CRITICAL
            try:
                filename = "gpfs_quota_%s_%s.gz" % (time.strftime("%Y%m%d-%H:%M"), key)
                path = os.path.join(opts.options.location, filename)
                # context manager closes the file even when the write fails;
                # gzip streams opened 'wb' require bytes, hence the encode()
                with gzip.open(path, 'wb', 9) as zipfile:  # Compress to the max
                    zipfile.write(json.dumps(quota[key]).encode())
                stats["%s_quota_log" % (key,)] = 0
                logger.info("Stored quota information for FS %s", key)
            except Exception:
                # one filesystem failing must not stop the others
                stats["%s_quota_log" % (key,)] = 1
                logger.exception("Failed storing quota information for FS %s", key)
    except Exception:
        logger.exception("Failure obtaining GPFS quota")
        opts.critical("Failure to obtain GPFS quota information")
        sys.exit(NAGIOS_EXIT_CRITICAL)

    opts.epilogue("Logged GPFS quota", stats)
def main():
    """
    Main script.
    - process the users and VOs
    - write the new timestamp if everything went OK
    - write the nagios check file
    """
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'storage': ('storage systems on which to deploy users and vos', None, 'extend', []),
    }
    opts = ExtendedSimpleOption(options)
    stats = {}

    try:
        storage_config = VscStorage()
        operations = GpfsOperations()
        # prime the GPFS caches up front so the per-storage loop below can reuse them
        operations.list_filesystems()
        operations.list_filesets()

        for storage in opts.options.storage:
            fs_name = storage_config[storage].filesystem
            fs_info = operations.get_filesystem_info(fs_name)
            set_up_filesystem(
                operations,
                storage_config,
                storage,
                fs_info,
                fs_name,
                vo_support=True,
                dry_run=opts.options.dry_run,
            )
    except Exception as err:
        logging.exception("critical exception caught: %s", err)
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)

    opts.epilogue("UGent users and VOs synchronised", stats)
def main():
    """
    Main script.
    - build the filter
    - fetches the users
    - process the users
    - write the new timestamp if everything went OK
    - write the nagios check file

    Synchronises changed UGent accounts, their storage quota, and VOs from the
    accountpage to the storage systems given on the command line. The sync
    timestamp is only advanced when no per-storage processing failed.
    """
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'storage': ('storage systems on which to deploy users and vos', None, 'extend', []),
        'user': ('process users', None, 'store_true', False),
        'vo': ('process vos', None, 'store_true', False),
        'access_token': ('OAuth2 token to access the account page REST API', None, 'store', None),
        'account_page_url': ('URL of the account page where we can find the REST API', None, 'store', None),
        'host_institute': ('Name of the institute where this script is being run', str, 'store', GENT),
    }
    opts = ExtendedSimpleOption(options)
    stats = {}

    try:
        # taken before the sync starts so we do not lose changes made while syncing
        now = datetime.utcnow()
        client = AccountpageClient(token=opts.options.access_token, url=opts.options.account_page_url + "/api/")

        try:
            last_timestamp = read_timestamp(SYNC_TIMESTAMP_FILENAME)
        except Exception:
            # fall back to a full resync from a fixed date when the cached timestamp is unreadable
            logger.exception("Something broke reading the timestamp from %s" % SYNC_TIMESTAMP_FILENAME)
            last_timestamp = "200901010000Z"

        logger.info("Last recorded timestamp was %s" % (last_timestamp))
        last_timestamp = convert_to_unix_timestamp(last_timestamp)

        (users_ok, users_fail) = ([], [])
        (quota_ok, quota_fail) = ([], [])
        if opts.options.user:
            # accounts changed since the last sync
            ugent_changed_accounts = client.account.institute['gent'].modified[last_timestamp].get()[1]

            logger.info("Found %d UGent accounts that have changed in the accountpage since %s" %
                        (len(ugent_changed_accounts), last_timestamp))

            ugent_accounts = [u['vsc_id'] for u in ugent_changed_accounts]
            ugent_accounts = nub(ugent_accounts)  # deduplicate, preserving order

            for storage_name in opts.options.storage:
                (users_ok, users_fail) = process_users(opts.options, ugent_accounts, storage_name, client,
                                                       opts.options.host_institute)
                stats["%s_users_sync" % (storage_name, )] = len(users_ok)
                stats["%s_users_sync_fail" % (storage_name, )] = len(users_fail)
                stats["%s_users_sync_fail_warning" % (storage_name, )] = STORAGE_USERS_LIMIT_WARNING
                stats["%s_users_sync_fail_critical" % (storage_name, )] = STORAGE_USERS_LIMIT_CRITICAL

            for storage_name in opts.options.storage:
                # quota changed since the last sync on this particular storage system
                storage_changed_quota = [mkVscUserSizeQuota(q) for q in
                                         client.quota.user.storage[storage_name].modified[last_timestamp].get()[1]]
                # only user filesets (vsc IDs) are relevant here
                storage_changed_quota = [q for q in storage_changed_quota if q.fileset.startswith('vsc')]
                logger.info("Found %d accounts that have changed quota on storage %s in the accountpage since %s",
                            len(storage_changed_quota), storage_name, last_timestamp)
                (quota_ok, quota_fail) = process_users_quota(opts.options, storage_changed_quota, storage_name,
                                                             client, opts.options.host_institute)
                stats["%s_quota_sync" % (storage_name, )] = len(quota_ok)
                stats["%s_quota_sync_fail" % (storage_name, )] = len(quota_fail)
                stats["%s_quota_sync_fail_warning" % (storage_name, )] = STORAGE_QUOTA_LIMIT_WARNING
                stats["%s_quota_sync_fail_critical" % (storage_name, )] = STORAGE_QUOTA_LIMIT_CRITICAL

        (vos_ok, vos_fail) = ([], [])
        if opts.options.vo:
            # a VO needs processing when either the VO itself or its quota changed
            ugent_changed_vos = client.vo.modified[last_timestamp].get()[1]
            ugent_changed_vo_quota = client.quota.vo.modified[last_timestamp].get()[1]

            ugent_vos = sorted(set([v['vsc_id'] for v in ugent_changed_vos] +
                                   [v['virtual_organisation'] for v in ugent_changed_vo_quota]))

            logger.info("Found %d UGent VOs that have changed in the accountpage since %s" %
                        (len(ugent_changed_vos), last_timestamp))
            logger.info("Found %d UGent VOs that have changed quota in the accountpage since %s" %
                        (len(ugent_changed_vo_quota), last_timestamp))
            logger.debug("Found the following UGent VOs: {vos}".format(vos=ugent_vos))

            for storage_name in opts.options.storage:
                (vos_ok, vos_fail) = process_vos(opts.options, ugent_vos, storage_name, client, last_timestamp,
                                                 opts.options.host_institute)
                stats["%s_vos_sync" % (storage_name, )] = len(vos_ok)
                stats["%s_vos_sync_fail" % (storage_name, )] = len(vos_fail)
                stats["%s_vos_sync_fail_warning" % (storage_name, )] = STORAGE_VO_LIMIT_WARNING
                stats["%s_vos_sync_fail_critical" % (storage_name, )] = STORAGE_VO_LIMIT_CRITICAL

        # only advance the timestamp when everything succeeded, so failed items
        # are retried on the next run
        if not (users_fail or quota_fail or vos_fail):
            (_, ldap_timestamp) = convert_timestamp(now)
            if not opts.options.dry_run:
                write_timestamp(SYNC_TIMESTAMP_FILENAME, ldap_timestamp)
    except Exception as err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)

    opts.epilogue("UGent users and VOs synchronised", stats)
def main():
    """
    Main script. The usual.

    Synchronises accountpage VOs and their members to the Slurm database by
    generating and executing sacctmgr commands. The sync timestamp is only
    written after the commands were executed (never in dry-run mode).
    """
    options = {
        "nagios-check-interval-threshold": NAGIOS_CHECK_INTERVAL_THRESHOLD,
        "access_token": ("OAuth2 token to access the account page REST API", None, "store", None),
        "account_page_url": (
            "URL of the account page where we can find the REST API",
            str,
            "store",
            "https://apivsc.ugent.be/django",
        ),
        'host_institute': ('Name of the institute where this script is being run', str, 'store', GENT),
        "clusters": (
            "Cluster(s) (comma-separated) to sync for. "
            "Overrides <host_institute>_SLURM_COMPUTE_CLUSTERS that are in production.",
            "strlist",
            "store",
            [],
        ),
        'start_timestamp': ('Timestamp to start the sync from', str, 'store', None),
        'cluster_classes': ('Classes of clusters that should be synced, comma-separated', "strlist", 'store',
                            [PRODUCTION, PILOT])
    }
    opts = ExtendedSimpleOption(options)
    stats = {}

    (last_timestamp, start_time) = retrieve_timestamp_with_default(
        SYNC_TIMESTAMP_FILENAME,
        start_timestamp=opts.options.start_timestamp)
    logging.info("Using timestamp %s", last_timestamp)
    logging.info("Using startime %s", start_time)

    try:
        client = AccountpageClient(token=opts.options.access_token, url=opts.options.account_page_url + "/api/")
        host_institute = opts.options.host_institute

        # current state of the Slurm database
        slurm_account_info = get_slurm_acct_info(SyncTypes.accounts)
        slurm_user_info = get_slurm_acct_info(SyncTypes.users)

        logging.debug("%d accounts found", len(slurm_account_info))
        logging.debug("%d users found", len(slurm_user_info))

        # explicit --clusters wins; otherwise take all clusters of the requested classes
        if opts.options.clusters:
            clusters = opts.options.clusters
        else:
            clusters = [cs for p in opts.options.cluster_classes
                        for cs in VSC_SLURM_CLUSTERS[host_institute][p]]

        sacctmgr_commands = []

        # All users belong to a VO, so fetching the VOs is necessary/
        account_page_vos = [mkVo(v) for v in client.vo.institute[opts.options.host_institute].get()[1]]

        # make sure the institutes and the default accounts (VOs) are there for each cluster
        institute_vos = dict([(v.vsc_id, v) for v in account_page_vos
                              if v.vsc_id in INSTITUTE_VOS_BY_INSTITUTE[host_institute].values()])
        sacctmgr_commands += slurm_institute_accounts(slurm_account_info, clusters, host_institute, institute_vos)

        # The VOs do not track active state of users, so we need to fetch all accounts as well
        active_accounts = set([a["vsc_id"] for a in client.account.get()[1] if a["isactive"]])

        # dictionary mapping the VO vsc_id on a tuple with the VO members and the VO itself
        account_page_members = dict([(vo.vsc_id, (set(vo.members), vo)) for vo in account_page_vos])

        # process all regular VOs
        sacctmgr_commands += slurm_vo_accounts(account_page_vos, slurm_account_info, clusters, host_institute)

        # process VO members
        sacctmgr_commands += slurm_user_accounts(account_page_members, active_accounts, slurm_user_info, clusters,
                                                 opts.options.dry_run)

        logging.info("Executing %d commands", len(sacctmgr_commands))

        if opts.options.dry_run:
            print("Commands to be executed:\n")
            print("\n".join([" ".join(c) for c in sacctmgr_commands]))
        else:
            execute_commands(sacctmgr_commands)

        # only advance the timestamp after a real (non-dry-run) execution
        if not opts.options.dry_run:
            (_, ldap_timestamp) = convert_timestamp(start_time)
            write_timestamp(SYNC_TIMESTAMP_FILENAME, ldap_timestamp)
            opts.epilogue("Accounts synced to slurm", stats)
        else:
            logging.info("Dry run done")

    except Exception as err:
        logging.exception("critical exception caught: %s", err)
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)
def main():
    """Synchronise accountpage users and groups to the LDAP-backed Django DB.

    The actual sync runs in a forked child that drops privileges to the
    apache user; the parent waits for the child and only advances the sync
    timestamp when the child exited cleanly (and this is not a dry run).
    """
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'start-timestamp': ("The timestamp form which to start, otherwise use the cached value", None, "store", None),
        'access_token': ('OAuth2 token identifying the user with the accountpage', None, 'store', None),
        'account_page_url': ('url for the account page', None, 'store', None),
        'start_timestamp': ('Timestamp to start the sync from', str, 'store', None),
    }
    # get access_token from conf file
    ExtendedSimpleOption.CONFIGFILES_INIT = ['/etc/account_page.conf']
    opts = ExtendedSimpleOption(options)
    stats = {}

    # Creating this here because this is a singleton class
    _ = LdapQuery(VscConfiguration(VSC_CONF_DEFAULT_FILENAME))

    (last_timestamp, start_time) = retrieve_timestamp_with_default(
        SYNC_TIMESTAMP_FILENAME,
        start_timestamp=opts.options.start_timestamp)
    logging.info("Using timestamp %s", last_timestamp)
    logging.info("Using startime %s", start_time)

    try:
        parent_pid = os.fork()
        logging.info("Forked.")
    except OSError:
        logging.exception("Could not fork")
        parent_pid = 1
    except Exception:
        logging.exception("Oops")
        parent_pid = 1

    if parent_pid == 0:
        # child: perform the sync with reduced privileges
        try:
            global logger
            logger = fancylogger.getLogger(NAGIOS_HEADER)
            # drop privileges in the child
            try:
                apache_uid = pwd.getpwnam('apache').pw_uid
                apache_gid = grp.getgrnam('apache').gr_gid

                # order matters: groups and gid must be set before giving up root
                os.setgroups([])
                os.setgid(apache_gid)
                os.setuid(apache_uid)

                logging.info("Now running as %s" % (os.geteuid(), ))
            except OSError:
                logger.raiseException("Could not drop privileges")

            client = AccountpageClient(token=opts.options.access_token,
                                       url=opts.options.account_page_url + '/api/')
            syncer = LdapSyncer(client)
            last = last_timestamp
            altered_accounts = syncer.sync_altered_accounts(last, opts.options.dry_run)
            logging.debug("Altered accounts: %s", altered_accounts)
            altered_groups = syncer.sync_altered_groups(last, opts.options.dry_run)
            logging.debug("Altered groups: %s" % altered_groups)

            if not altered_accounts[ERROR] and not altered_groups[ERROR]:
                logging.info("Child process exiting correctly")
                sys.exit(0)
            else:
                # NOTE(review): sys.exit(-1) surfaces as exit status 255 to the parent
                logging.info("Child process exiting with status -1")
                logging.warning("Error occured in %s" % ([
                    "%s: %s\n" % (k, v) for (k, v) in [
                        ("altered accounts", altered_accounts[ERROR]),
                        ("altered groups", altered_groups[ERROR]),
                    ]
                ]))
                sys.exit(-1)
        except Exception:
            logging.exception("Child caught an exception")
            sys.exit(-1)
    else:
        # parent
        (_, result) = os.waitpid(parent_pid, 0)
        logging.info("Child exited with exit code %d" % (result, ))

        # only advance the timestamp when the child succeeded and this is no dry run
        if not result and not opts.options.dry_run:
            (_, ldap_timestamp) = convert_timestamp(start_time)
            write_timestamp(SYNC_TIMESTAMP_FILENAME, ldap_timestamp)
            opts.epilogue("Synchronised LDAP users to the Django DB", stats)
        else:
            sys.exit(NAGIOS_EXIT_CRITICAL)
def main():
    """Synchronise accountpage users and groups to the LDAP-backed Django DB.

    Older variant: resolves the start timestamp manually (command line value,
    cached file, or a hardcoded fallback), then forks a child that drops
    privileges to the apache user and performs the sync. The parent waits and
    only advances the timestamp on clean child exit.
    """
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'start-timestamp': ("The timestamp form which to start, otherwise use the cached value", None, "store", None),
        'access_token': ('OAuth2 token identifying the user with the accountpage', None, 'store', None),
        'account_page_url': ('url for the account page', None, 'store', None),
    }
    # get access_token from conf file
    ExtendedSimpleOption.CONFIGFILES_INIT = ['/etc/account_page.conf']
    opts = ExtendedSimpleOption(options)
    stats = {}

    # Creating this here because this is a singleton class
    _ = LdapQuery(VscConfiguration(VSC_CONF_DEFAULT_FILENAME))

    last_timestamp = opts.options.start_timestamp
    if not last_timestamp:
        try:
            last_timestamp = read_timestamp(SYNC_TIMESTAMP_FILENAME)
        except Exception:
            _log.warning("Something broke reading the timestamp from %s", SYNC_TIMESTAMP_FILENAME)
            last_timestamp = "201710230000Z"
            _log.warning("We will resync from a hardcoded know working sync a while back : %s", last_timestamp)

    _log.info("Using timestamp %s", last_timestamp)

    # record starttime before starting, and take a 10 sec safety buffer so we don't get gaps where users are approved
    # in between the requesting of modified users and writing out the start time
    start_time = datetime.datetime.now() + datetime.timedelta(seconds=-10)
    _log.info("startime %s", start_time)

    try:
        parent_pid = os.fork()
        _log.info("Forked.")
    except OSError:
        _log.exception("Could not fork")
        parent_pid = 1
    except Exception:
        _log.exception("Oops")
        parent_pid = 1

    if parent_pid == 0:
        # child: perform the sync with reduced privileges
        try:
            global _log
            _log = fancylogger.getLogger(NAGIOS_HEADER)
            # drop privileges in the child
            try:
                apache_uid = pwd.getpwnam('apache').pw_uid
                apache_gid = grp.getgrnam('apache').gr_gid

                # order matters: groups and gid must be set before giving up root
                os.setgroups([])
                os.setgid(apache_gid)
                os.setuid(apache_uid)

                _log.info("Now running as %s" % (os.geteuid(), ))
            except OSError:
                _log.raiseException("Could not drop privileges")

            client = AccountpageClient(token=opts.options.access_token,
                                       url=opts.options.account_page_url + '/api/')
            syncer = LdapSyncer(client)
            # convert the LDAP-style timestamp to seconds since the epoch
            last = int((datetime.datetime.strptime(last_timestamp, "%Y%m%d%H%M%SZ") -
                        datetime.datetime(1970, 1, 1)).total_seconds())
            altered_accounts = syncer.sync_altered_accounts(last, opts.options.dry_run)
            _log.debug("Altered accounts: %s", altered_accounts)
            altered_groups = syncer.sync_altered_groups(last, opts.options.dry_run)
            _log.debug("Altered groups: %s" % altered_groups)

            if not altered_accounts[ERROR] and not altered_groups[ERROR]:
                _log.info("Child process exiting correctly")
                sys.exit(0)
            else:
                # NOTE(review): sys.exit(-1) surfaces as exit status 255 to the parent
                _log.info("Child process exiting with status -1")
                _log.warning("Error occured in %s" % ([
                    "%s: %s\n" % (k, v) for (k, v) in [
                        ("altered accounts", altered_accounts[ERROR]),
                        ("altered groups", altered_groups[ERROR]),
                    ]
                ]))
                sys.exit(-1)
        except Exception:
            _log.exception("Child caught an exception")
            sys.exit(-1)
    else:
        # parent
        (_, result) = os.waitpid(parent_pid, 0)
        _log.info("Child exited with exit code %d" % (result, ))

        if not result:
            # only advance the cached timestamp when none was forced on the command line
            if not opts.options.start_timestamp:
                (_, ldap_timestamp) = convert_timestamp(start_time)
                if not opts.options.dry_run:
                    write_timestamp(SYNC_TIMESTAMP_FILENAME, ldap_timestamp)
            else:
                _log.info("Not updating the timestamp, since one was provided on the command line")
            opts.epilogue("Synchronised LDAP users to the Django DB", stats)
        else:
            _log.info("Not updating the timestamp, since it was given on the command line for this run")
            sys.exit(NAGIOS_EXIT_CRITICAL)
def main():
    """
    Main script.
    - build the filter
    - fetches the users
    - process the users
    - write the new timestamp if everything went OK
    - write the nagios check file

    Institute-generic variant: synchronises changed accounts, user quota and
    VOs from the accountpage to the given storage systems. The sync timestamp
    is only advanced when nothing failed and this is not a dry run.
    """
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'storage': ('storage systems on which to deploy users and vos', None, 'extend', []),
        'user': ('process users', None, 'store_true', False),
        'vo': ('process vos', None, 'store_true', False),
        'access_token': ('OAuth2 token to access the account page REST API', None, 'store', None),
        'account_page_url': ('URL of the account page where we can find the REST API', None, 'store', None),
        'host_institute': ('Name of the institute where this script is being run', str, 'store', GENT),
        'start_timestamp': ('Timestamp to start the sync from', str, 'store', None),
    }
    opts = ExtendedSimpleOption(options)
    stats = {}

    (last_timestamp, start_time) = retrieve_timestamp_with_default(
        SYNC_TIMESTAMP_FILENAME,
        start_timestamp=opts.options.start_timestamp)
    logging.info("Using timestamp %s", last_timestamp)
    logging.info("Using startime %s", start_time)

    try:
        client = AccountpageClient(token=opts.options.access_token, url=opts.options.account_page_url + "/api/")
        institute = opts.options.host_institute

        (users_ok, users_fail) = ([], [])
        (quota_ok, quota_fail) = ([], [])
        if opts.options.user:
            # accounts changed since the last sync
            changed_accounts = client.account.institute[institute].modified[last_timestamp].get()[1]

            logging.info("Found %d %s accounts that have changed in the accountpage since %s" %
                         (len(changed_accounts), institute, last_timestamp))

            accounts = nub([u['vsc_id'] for u in changed_accounts])  # deduplicate

            for storage_name in opts.options.storage:
                (users_ok, users_fail) = process_users(opts.options, accounts, storage_name, client, institute)
                stats["%s_users_sync" % (storage_name, )] = len(users_ok)
                stats["%s_users_sync_fail" % (storage_name, )] = len(users_fail)
                stats["%s_users_sync_fail_warning" % (storage_name, )] = STORAGE_USERS_LIMIT_WARNING
                stats["%s_users_sync_fail_critical" % (storage_name, )] = STORAGE_USERS_LIMIT_CRITICAL

            for storage_name in opts.options.storage:
                # quota changed since the last sync on this particular storage system
                storage_changed_quota = [mkVscUserSizeQuota(q) for q in
                                         client.quota.user.storage[storage_name].modified[last_timestamp].get()[1]]
                # only user filesets (vsc IDs) are relevant here
                storage_changed_quota = [q for q in storage_changed_quota if q.fileset.startswith('vsc')]
                logging.info("Found %d accounts that have changed quota on storage %s in the accountpage since %s",
                             len(storage_changed_quota), storage_name, last_timestamp)
                (quota_ok, quota_fail) = process_users_quota(opts.options, storage_changed_quota, storage_name,
                                                             client, institute)
                stats["%s_quota_sync" % (storage_name, )] = len(quota_ok)
                stats["%s_quota_sync_fail" % (storage_name, )] = len(quota_fail)
                stats["%s_quota_sync_fail_warning" % (storage_name, )] = STORAGE_QUOTA_LIMIT_WARNING
                stats["%s_quota_sync_fail_critical" % (storage_name, )] = STORAGE_QUOTA_LIMIT_CRITICAL

        (vos_ok, vos_fail) = ([], [])
        if opts.options.vo:
            # a VO needs processing when either the VO itself or its quota changed
            changed_vos = client.vo.institute[institute].modified[last_timestamp].get()[1]
            changed_vo_quota = client.quota.vo.modified[last_timestamp].get()[1]

            vos = sorted(set([v['vsc_id'] for v in changed_vos] +
                             [v['virtual_organisation'] for v in changed_vo_quota]))

            logging.info("Found %d %s VOs that have changed in the accountpage since %s" %
                         (len(changed_vos), institute, last_timestamp))
            logging.info("Found %d %s VOs that have changed quota in the accountpage since %s" %
                         (len(changed_vo_quota), institute, last_timestamp))
            logging.debug("Found the following {institute} VOs: {vos}".format(institute=institute, vos=vos))

            for storage_name in opts.options.storage:
                (vos_ok, vos_fail) = process_vos(opts.options, vos, storage_name, client, last_timestamp, institute)
                stats["%s_vos_sync" % (storage_name, )] = len(vos_ok)
                stats["%s_vos_sync_fail" % (storage_name, )] = len(vos_fail)
                stats["%s_vos_sync_fail_warning" % (storage_name, )] = STORAGE_VO_LIMIT_WARNING
                stats["%s_vos_sync_fail_critical" % (storage_name, )] = STORAGE_VO_LIMIT_CRITICAL

        # only advance the timestamp when everything succeeded, so failed items
        # are retried on the next run
        if not (users_fail or quota_fail or vos_fail) and not opts.options.dry_run:
            (_, ldap_timestamp) = convert_timestamp(start_time)
            write_timestamp(SYNC_TIMESTAMP_FILENAME, ldap_timestamp)
    except Exception as err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)

    opts.epilogue("%s users and VOs synchronised" % institute, stats)
def main():
    """Main script

    Checks fileset and user quota usage on the given VSC filesystems and
    reports exceeders via nagios stats and log warnings.
    """
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'storage': ('the VSC filesystems that are checked by this script', None, 'extend', []),
        'write-cache': ('Write the data into the cache files in the FS', None, 'store_true', False),
        'account_page_url': ('Base URL of the account page', None, 'store', 'https://account.vscentrum.be/django'),
        'access_token': ('OAuth2 token to access the account page REST API', None, 'store', None),
        'host_institute': ('Name of the institute where this script is being run', str, 'store', GENT),
    }
    opts = ExtendedSimpleOption(options)
    logger = opts.log

    try:
        client = AccountpageClient(token=opts.options.access_token)

        user_id_map = map_uids_to_names()  # is this really necessary?
        gpfs = GpfsOperations()
        storage = VscStorage()

        # restrict the (expensive) GPFS queries to the requested filesystems
        target_filesystems = [storage[s].filesystem for s in opts.options.storage]

        filesystems = gpfs.list_filesystems(device=target_filesystems).keys()
        logger.debug("Found the following GPFS filesystems: %s" % (filesystems))

        filesets = gpfs.list_filesets(devices=target_filesystems)
        logger.debug("Found the following GPFS filesets: %s" % (filesets))

        quota = gpfs.list_quota(devices=target_filesystems)
        exceeding_filesets = {}
        exceeding_users = {}
        # NOTE(review): stats is defined inside the try; an exception before this
        # point would make the epilogue below fail on an undefined name
        stats = {}

        for storage_name in opts.options.storage:
            logger.info("Processing quota for storage_name %s" % (storage_name))
            filesystem = storage[storage_name].filesystem
            replication_factor = storage[storage_name].data_replication_factor

            if filesystem not in filesystems:
                logger.error("Non-existent filesystem %s" % (filesystem))
                continue

            if filesystem not in quota.keys():
                logger.error("No quota defined for storage_name %s [%s]" % (storage_name, filesystem))
                continue

            quota_storage_map = get_mmrepquota_maps(
                quota[filesystem],
                storage_name,
                filesystem,
                filesets,
                replication_factor,
            )

            exceeding_filesets[storage_name] = process_fileset_quota(
                storage, gpfs, storage_name, filesystem, quota_storage_map['FILESET'], client,
                dry_run=opts.options.dry_run, institute=opts.options.host_institute)
            exceeding_users[storage_name] = process_user_quota(
                storage, gpfs, storage_name, None, quota_storage_map['USR'], user_id_map, client,
                dry_run=opts.options.dry_run, institute=opts.options.host_institute)

            stats["%s_fileset_critical" % (storage_name, )] = QUOTA_FILESETS_CRITICAL
            if exceeding_filesets[storage_name]:
                stats["%s_fileset" % (storage_name, )] = 1
                logger.warning("storage_name %s found %d filesets that are exceeding their quota",
                               storage_name, len(exceeding_filesets))
                for (e_fileset, e_quota) in exceeding_filesets[storage_name]:
                    logger.warning("%s has quota %s" % (e_fileset, str(e_quota)))
            else:
                stats["%s_fileset" % (storage_name, )] = 0
                logger.debug("storage_name %s found no filesets that are exceeding their quota" % storage_name)

            stats["%s_users_warning" % (storage_name, )] = QUOTA_USERS_WARNING
            stats["%s_users_critical" % (storage_name, )] = QUOTA_USERS_CRITICAL
            if exceeding_users[storage_name]:
                stats["%s_users" % (storage_name, )] = len(exceeding_users[storage_name])
                logger.warning("storage_name %s found %d users who are exceeding their quota" %
                               (storage_name, len(exceeding_users[storage_name])))
                for (e_user_id, e_quota) in exceeding_users[storage_name]:
                    logger.warning("%s has quota %s" % (e_user_id, str(e_quota)))
            else:
                stats["%s_users" % (storage_name, )] = 0
                logger.debug("storage_name %s found no users who are exceeding their quota" % storage_name)

    except Exception as err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")

    opts.epilogue("quota check completed", stats)
def main():
    """Dump GPFS fileset (inode) information to gzipped JSON logs.

    For every GPFS filesystem the fileset information is stored as a gzipped
    JSON file under --location, the inode usage is checked against the
    fileset quota, and the admins are mailed when filesets approach their
    limit. Nagios statistics are reported through the epilogue.
    """
    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'location': ('path to store the gzipped files', None, 'store', INODE_LOG_ZIP_PATH),
    }
    opts = ExtendedSimpleOption(options)
    logger = opts.log
    stats = {}

    try:
        gpfs = GpfsOperations()
        filesets = gpfs.list_filesets()
        quota = gpfs.list_quota()

        if not os.path.exists(opts.options.location):
            os.makedirs(opts.options.location, 0o755)

        critical_filesets = dict()

        for filesystem in filesets:
            stats["%s_inodes_log_critical" % (filesystem,)] = INODE_STORE_LOG_CRITICAL
            try:
                filename = "gpfs_inodes_%s_%s.gz" % (time.strftime("%Y%m%d-%H:%M"), filesystem)
                path = os.path.join(opts.options.location, filename)
                # context manager closes the file even when the write fails;
                # gzip streams opened 'wb' require bytes on Python 3, hence the encode()
                with gzip.open(path, 'wb', 9) as zipfile:  # Compress to the max
                    zipfile.write(json.dumps(filesets[filesystem]).encode())
                stats["%s_inodes_log" % (filesystem,)] = 0
                logger.info("Stored inodes information for FS %s", filesystem)

                # flag filesets at or above 90% of their inode limit
                cfs = process_inodes_information(filesets[filesystem], quota[filesystem]['FILESET'], threshold=0.9)
                logger.info("Processed inodes information for filesystem %s", filesystem)
                if cfs:
                    critical_filesets[filesystem] = cfs
                    logger.info("Filesystem %s has at least %d filesets reaching the limit", filesystem, len(cfs))
            except Exception:
                # one filesystem failing must not stop the others
                stats["%s_inodes_log" % (filesystem,)] = 1
                logger.exception("Failed storing inodes information for FS %s", filesystem)

        logger.info("Critical filesets: %s", critical_filesets)
        if critical_filesets:
            mail_admins(critical_filesets, opts.options.dry_run)

    except Exception:
        logger.exception("Failure obtaining GPFS inodes")
        opts.critical("Failure to obtain GPFS inodes information")

    opts.epilogue("Logged GPFS inodes", stats)
def main():
    """
    Main script. The usual.

    Older Gent-only variant of the Slurm account sync: builds sacctmgr
    commands for institute accounts, VOs and VO members, then executes them
    (or prints them in dry-run mode).
    """
    options = {
        "nagios-check-interval-threshold": NAGIOS_CHECK_INTERVAL_THRESHOLD,
        "access_token": ("OAuth2 token to access the account page REST API", None, "store", None),
        "account_page_url": (
            "URL of the account page where we can find the REST API",
            str,
            "store",
            "https://apivsc.ugent.be/django",
        ),
        "clusters": (
            "Cluster(s) (comma-separated) to sync for. "
            "Overrides GENT_SLURM_COMPUTE_CLUSTERS that are in production.",
            str,
            "store",
            None,
        ),
    }
    opts = ExtendedSimpleOption(options)
    stats = {}

    try:
        client = AccountpageClient(token=opts.options.access_token, url=opts.options.account_page_url + "/api/")

        last_timestamp = "201804010000Z"  # the beginning of time
        logging.info("Last recorded timestamp was %s" % (last_timestamp))

        # current state of the Slurm database
        slurm_account_info = get_slurm_acct_info(SyncTypes.accounts)
        slurm_user_info = get_slurm_acct_info(SyncTypes.users)

        logging.debug("%d accounts found", len(slurm_account_info))
        logging.debug("%d users found", len(slurm_user_info))

        # explicit --clusters wins; otherwise take all production clusters
        if opts.options.clusters is not None:
            clusters = opts.options.clusters.split(",")
        else:
            clusters = [c for c in GENT_SLURM_COMPUTE_CLUSTERS if c in GENT_PRODUCTION_COMPUTE_CLUSTERS]

        sacctmgr_commands = []

        # make sure the institutes and the default accounts (VOs) are there for each cluster
        sacctmgr_commands += slurm_institute_accounts(slurm_account_info, clusters)

        # All users belong to a VO, so fetching the VOs is necessary/
        account_page_vos = [mkVo(v) for v in client.vo.get()[1]]

        # The VOs do not track active state of users, so we need to fetch all accounts as well
        active_accounts = set([a["vsc_id"] for a in client.account.get()[1] if a["isactive"]])

        # dictionary mapping the VO vsc_id on a tuple with the VO members and the VO itself
        account_page_members = dict([(vo.vsc_id, (set(vo.members), vo)) for vo in account_page_vos])

        # process all regular VOs
        sacctmgr_commands += slurm_vo_accounts(account_page_vos, slurm_account_info, clusters)

        # process VO members
        sacctmgr_commands += slurm_user_accounts(account_page_members, active_accounts, slurm_user_info, clusters,
                                                 opts.options.dry_run)

        logging.info("Executing %d commands", len(sacctmgr_commands))

        if opts.options.dry_run:
            print("Commands to be executed:\n")
            print("\n".join([" ".join(c) for c in sacctmgr_commands]))
        else:
            execute_commands(sacctmgr_commands)

    except Exception as err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)

    if not opts.options.dry_run:
        opts.epilogue("Accounts synced to slurm", stats)
    else:
        logger.info("Dry run done")
def main():
    """Dump fileset (inode) information from the storage backend to gzipped JSON logs.

    For every filesystem the fileset information is stored as a gzipped JSON
    file under --location, the inode usage is checked against the fileset
    quota, and the admins are mailed when filesets approach their limit.
    Nagios statistics are reported through the epilogue.
    """
    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'location': ('path to store the gzipped files', None, 'store', INODE_LOG_ZIP_PATH),
        'backend': ('Storage backend', None, 'store', 'gpfs'),
        'host_institute': ('Name of the institute where this script is being run', str, 'store', GENT),
    }
    opts = ExtendedSimpleOption(options)
    logger = opts.log
    stats = {}
    backend = opts.options.backend

    try:
        if backend == 'gpfs':
            storage_backend = GpfsOperations()
        elif backend == 'lustre':
            storage_backend = LustreOperations()
        else:
            # fail explicitly instead of falling through to a NameError on
            # storage_backend; the outer except turns this into a critical state
            logger.error("Backend %s not supported", backend)
            raise ValueError("Backend %s not supported" % backend)

        filesets = storage_backend.list_filesets()
        quota = storage_backend.list_quota()

        if not os.path.exists(opts.options.location):
            os.makedirs(opts.options.location, 0o755)

        critical_filesets = dict()

        for filesystem in filesets:
            stats["%s_inodes_log_critical" % (filesystem,)] = INODE_STORE_LOG_CRITICAL
            try:
                filename = "%s_inodes_%s_%s.gz" % (backend, time.strftime("%Y%m%d-%H:%M"), filesystem)
                path = os.path.join(opts.options.location, filename)
                # context manager closes the file even when the write fails;
                # gzip streams opened 'wb' require bytes on Python 3, hence the encode()
                with gzip.open(path, 'wb', 9) as zipfile:  # Compress to the max
                    zipfile.write(json.dumps(filesets[filesystem]).encode())
                stats["%s_inodes_log" % (filesystem,)] = 0
                logger.info("Stored inodes information for FS %s", filesystem)

                # flag filesets at or above 90% of their inode limit
                cfs = process_inodes_information(filesets[filesystem], quota[filesystem]['FILESET'],
                                                 threshold=0.9, storage=backend)
                logger.info("Processed inodes information for filesystem %s", filesystem)
                if cfs:
                    critical_filesets[filesystem] = cfs
                    logger.info("Filesystem %s has at least %d filesets reaching the limit", filesystem, len(cfs))
            except Exception:
                # one filesystem failing must not stop the others
                stats["%s_inodes_log" % (filesystem,)] = 1
                logger.exception("Failed storing inodes information for FS %s", filesystem)

        logger.info("Critical filesets: %s", critical_filesets)
        if critical_filesets:
            mail_admins(critical_filesets,
                        dry_run=opts.options.dry_run,
                        host_institute=opts.options.host_institute)
    except Exception:
        logger.exception("Failure obtaining %s inodes", backend)
        opts.critical("Failure to obtain %s inodes information" % backend)

    opts.epilogue("Logged %s inodes" % backend, stats)