def main():
    """Set up filesystems and filesets for every storage system listed in the quota config file.

    Reads the storage names from the 'MAIN'/'storage' option of QUOTA_CONF_FILE.
    VSC_HOME additionally gets its apps fileset set up; every other storage
    system is set up with VO support enabled.
    """
    storage_settings = VscStorage()

    local_storage_conf = configparser.SafeConfigParser()
    local_storage_conf.read(QUOTA_CONF_FILE)

    gpfs = GpfsOperations()
    # Populate the GPFS filesystem/fileset caches before querying per-filesystem info.
    gpfs.list_filesystems()
    gpfs.list_filesets()

    for storage_name in local_storage_conf.get('MAIN', 'storage').split(','):
        filesystem_name = storage_settings[storage_name].filesystem
        filesystem_info = gpfs.get_filesystem_info(filesystem_name)

        # BUG FIX: the original `storage_name in ('VSC_HOME')` tested substring
        # membership in the *string* 'VSC_HOME' (parentheses without a comma do
        # not create a tuple), so e.g. 'HOME' would also match. Use equality.
        if storage_name == 'VSC_HOME':
            set_up_filesystem(gpfs, storage_settings, storage_name, filesystem_info, filesystem_name)
            set_up_apps(gpfs, storage_settings, storage_name, filesystem_info, filesystem_name)
        else:
            set_up_filesystem(gpfs, storage_settings, storage_name, filesystem_info, filesystem_name,
                              vo_support=True)
def __init__(self, vo_id, storage=None, rest_client=None, host_institute=GENT):
    """Initialise the VO wrapper.

    Sets up the accountpage base class, the storage configuration (a fresh
    VscStorage unless one is supplied), the GPFS/POSIX operation helpers,
    and empties all lazily-filled quota/group caches.
    """
    super(VscTier2AccountpageVo, self).__init__(vo_id, rest_client)

    self.vo_id = vo_id
    self.vsc = VSC()
    self.host_institute = host_institute

    # Fall back to the default storage configuration when none was given.
    self.storage = storage if storage else VscStorage()

    self.gpfs = GpfsOperations()
    self.posix = PosixOperations()
    self.dry_run = False

    # Caches populated on demand by the quota/sharing accessors.
    self._vo_data_quota_cache = None
    self._vo_data_shared_quota_cache = None
    self._vo_scratch_quota_cache = None
    self._institute_quota_cache = None
    self._sharing_group_cache = None
def main(): """Yeah, so, erm. The main function and such.""" options = { "summary": ("Give the summary", None, "store_true", True, 's'), "detail": ( "Detailed information", None, "store_true", False, ), "virtualorganisation": ("Give VO details if available", None, "store_true", False, 'v'), "running": ("Display running job information", None, "store_true", False, 'r'), "idle": ("Display idle job information", None, "store_true", False, 'i'), "blocked": ("Dispay blocked job information", None, "store_true", False, 'b'), 'hosts': ("Hosts/clusters to check", None, 'extend', []), 'location_environment': ('the location for storing the pickle file depending on the cluster', str, 'store', 'VSC_SCRATCH_DELCATTY'), } opts = simple_option(options, config_files=['/etc/myshowq.conf']) if not (opts.options.running or opts.options.idle or opts.options.blocked): opts.options.running = True opts.options.idle = True opts.options.blocked = True storage = VscStorage() user_name = getpwuid(os.getuid())[0] mount_point = storage[opts.options.location_environment].login_mount_point path_template = storage.path_templates[ opts.options.location_environment]['user'] path = os.path.join(mount_point, path_template[0], path_template[1](user_name), ".showq.json.gz") (res, user_map) = read_cache(user_name, opts.options.virtualorganisation, opts.options.running, opts.options.idle, opts.options.blocked, path) if not res or len(res) == 0: print "no data" sys.exit(0) if opts.options.summary: showsummary(opts.options.hosts, res, user_map, user_name, opts.options.virtualorganisation) if opts.options.detail: showdetail()
def __init__(self, user_id, storage=None, pickle_storage=None, rest_client=None,
             account=None, pubkeys=None, host_institute=None, use_user_cache=False):
    """Initialise a tier-2 accountpage user.

    @type user_id: string representing the user's VSC ID (vsc[0-9]{5})

    When no host institute is given, GENT is assumed; when no pickle storage
    is given, the institute's default scratch storage is used.
    """
    super(VscTier2AccountpageUser, self).__init__(user_id, rest_client, account=account,
                                                  pubkeys=pubkeys, use_user_cache=use_user_cache)

    # Per-institute default location for the pickle cache. Move to vsc-config?
    default_pickle_storage = {
        GENT: VSC_SCRATCH_KYUKON,
        BRUSSEL: VSC_SCRATCH_THEIA,
    }

    self.host_institute = GENT if host_institute is None else host_institute
    self.pickle_storage = (default_pickle_storage[self.host_institute]
                           if pickle_storage is None else pickle_storage)

    if storage is None:
        storage = VscStorage()
    self.institute_path_templates = storage.path_templates[self.host_institute]
    self.institute_storage = storage[self.host_institute]

    self.vsc = VSC()
    self.gpfs = GpfsOperations()  # instantiated up front, only used when needed
    self.posix = PosixOperations()
def main(): """ Main script. - process the users and VOs - write the new timestamp if everything went OK - write the nagios check file """ options = { 'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD, 'storage': ('storage systems on which to deploy users and vos', None, 'extend', []), } opts = ExtendedSimpleOption(options) stats = {} try: storage_settings = VscStorage() gpfs = GpfsOperations() gpfs.list_filesystems() gpfs.list_filesets() for storage_name in opts.options.storage: filesystem_name = storage_settings[storage_name].filesystem filesystem_info = gpfs.get_filesystem_info(filesystem_name) set_up_filesystem(gpfs, storage_settings, storage_name, filesystem_info, filesystem_name, vo_support=True, dry_run=opts.options.dry_run) except Exception as err: logging.exception("critical exception caught: %s", err) opts.critical("Script failed in a horrible way") sys.exit(NAGIOS_EXIT_CRITICAL) opts.epilogue("UGent users and VOs synchronised", stats)
def __init__(self, user_id, storage=None, pickle_storage='VSC_SCRATCH_KYUKON', rest_client=None,
             account=None, pubkeys=None, host_institute=None, use_user_cache=False):
    """Initialise a tier-2 accountpage user.

    @type vsc_user_id: string representing the user's VSC ID (vsc[0-9]{5})

    Stores the pickle storage name and host institute as given, and falls
    back to a fresh VscStorage when no storage configuration is supplied.
    """
    super(VscTier2AccountpageUser, self).__init__(user_id, rest_client, account=account,
                                                  pubkeys=pubkeys, use_user_cache=use_user_cache)

    self.pickle_storage = pickle_storage
    self.storage = storage if storage else VscStorage()

    self.vsc = VSC()
    self.gpfs = GpfsOperations()  # instantiated up front, only used when needed
    self.posix = PosixOperations()

    self.host_institute = host_institute
def main():
    """Display cached checkjob information for the requested job of the current user.

    Reads the gzipped JSON checkjob cache from the user's login mount point
    and prints the details of the job given via --jobid.
    NOTE: this is Python 2 code (print statement below).
    """
    options = {
        'jobid': ('Fully qualified identification of the job', None, 'store', None),
        'location_environment': ('the location for storing the pickle file depending on the cluster',
                                 str, 'store', 'VSC_SCRATCH_DELCATTY'),
    }
    opts = simple_option(options, config_files=['/etc/mycheckjob.conf'])

    storage = VscStorage()

    # Resolve the cache file path: <login mount point>/<template dir>/<per-user dir>/.checkjob.json.gz
    user_name = getpwuid(os.getuid())[0]
    mount_point = storage[opts.options.location_environment].login_mount_point
    path_template = storage.path_templates[opts.options.location_environment]['user']
    path = os.path.join(mount_point, path_template[0], path_template[1](user_name), ".checkjob.json.gz")

    checkjob_info = read_cache(path)

    print checkjob_info.display(opts.options.jobid)
def main(): """Main script""" options = { 'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD, 'storage': ('the VSC filesystems that are checked by this script', None, 'extend', []), 'write-cache': ('Write the data into the cache files in the FS', None, 'store_true', False), 'account_page_url': ('Base URL of the account page', None, 'store', 'https://account.vscentrum.be/django'), 'access_token': ('OAuth2 token to access the account page REST API', None, 'store', None), 'host_institute': ('Name of the institute where this script is being run', str, 'store', GENT), } opts = ExtendedSimpleOption(options) logger = opts.log try: client = AccountpageClient(token=opts.options.access_token) user_id_map = map_uids_to_names() # is this really necessary? gpfs = GpfsOperations() storage = VscStorage() target_filesystems = [ storage[s].filesystem for s in opts.options.storage ] filesystems = gpfs.list_filesystems(device=target_filesystems).keys() logger.debug("Found the following GPFS filesystems: %s" % (filesystems)) filesets = gpfs.list_filesets(devices=target_filesystems) logger.debug("Found the following GPFS filesets: %s" % (filesets)) quota = gpfs.list_quota(devices=target_filesystems) exceeding_filesets = {} exceeding_users = {} stats = {} for storage_name in opts.options.storage: logger.info("Processing quota for storage_name %s" % (storage_name)) filesystem = storage[storage_name].filesystem replication_factor = storage[storage_name].data_replication_factor if filesystem not in filesystems: logger.error("Non-existent filesystem %s" % (filesystem)) continue if filesystem not in quota.keys(): logger.error("No quota defined for storage_name %s [%s]" % (storage_name, filesystem)) continue quota_storage_map = get_mmrepquota_maps( quota[filesystem], storage_name, filesystem, filesets, replication_factor, ) exceeding_filesets[storage_name] = process_fileset_quota( storage, gpfs, storage_name, filesystem, quota_storage_map['FILESET'], client, 
dry_run=opts.options.dry_run, institute=opts.options.host_institute) exceeding_users[storage_name] = process_user_quota( storage, gpfs, storage_name, None, quota_storage_map['USR'], user_id_map, client, dry_run=opts.options.dry_run, institute=opts.options.host_institute) stats["%s_fileset_critical" % (storage_name, )] = QUOTA_FILESETS_CRITICAL if exceeding_filesets[storage_name]: stats["%s_fileset" % (storage_name, )] = 1 logger.warning( "storage_name %s found %d filesets that are exceeding their quota", storage_name, len(exceeding_filesets)) for (e_fileset, e_quota) in exceeding_filesets[storage_name]: logger.warning("%s has quota %s" % (e_fileset, str(e_quota))) else: stats["%s_fileset" % (storage_name, )] = 0 logger.debug( "storage_name %s found no filesets that are exceeding their quota" % storage_name) stats["%s_users_warning" % (storage_name, )] = QUOTA_USERS_WARNING stats["%s_users_critical" % (storage_name, )] = QUOTA_USERS_CRITICAL if exceeding_users[storage_name]: stats["%s_users" % (storage_name, )] = len( exceeding_users[storage_name]) logger.warning( "storage_name %s found %d users who are exceeding their quota" % (storage_name, len(exceeding_users[storage_name]))) for (e_user_id, e_quota) in exceeding_users[storage_name]: logger.warning("%s has quota %s" % (e_user_id, str(e_quota))) else: stats["%s_users" % (storage_name, )] = 0 logger.debug( "storage_name %s found no users who are exceeding their quota" % storage_name) except Exception as err: logger.exception("critical exception caught: %s" % (err)) opts.critical("Script failed in a horrible way") opts.epilogue("quota check completed", stats)
def main():
    """Gather checkjob output for all active users and store a per-user cache file on GPFS.

    NOTE: this is Python 2 code (`except Exception, err` syntax below).
    """
    # Collect all info
    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., ofr each cluster will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'hosts': ('the hosts/clusters that should be contacted for job information', None, 'extend', []),
        'location': ('the location for storing the pickle file: delcatty, muk', str, 'store', 'delcatty'),
        'access_token': ('the token that will allow authentication against the account page', None, 'store', None),
        'account_page_url': ('', None, 'store', None),
        'target_master': ('the master used to execute showq commands', None, 'store', None),
        'target_user': ('the user for ssh to the target master', None, 'store', None),
    }

    opts = ExtendedSimpleOption(options)

    try:
        rest_client = AccountpageClient(token=opts.options.access_token)

        gpfs = GpfsOperations()
        storage = VscStorage()
        storage_name = cluster_user_pickle_store_map[opts.options.location]
        login_mount_point = storage[storage_name].login_mount_point
        gpfs_mount_point = storage[storage_name].gpfs_mount_point

        # Build the per-host cluster config (master + checkjob binary path) from the config file.
        clusters = {}
        for host in opts.options.hosts:
            master = opts.configfile_parser.get(host, "master")
            checkjob_path = opts.configfile_parser.get(host, "checkjob_path")
            clusters[host] = {'master': master, 'path': checkjob_path}

        checkjob = SshCheckjob(opts.options.target_master,
                               opts.options.target_user,
                               clusters,
                               cache_pickle=True,
                               dry_run=opts.options.dry_run)

        (job_information, _, _) = checkjob.get_moab_command_information()

        active_users = job_information.keys()

        logger.debug("Active users: %s" % (active_users))
        logger.debug("Checkjob information: %s" % (job_information))

        nagios_user_count = 0
        nagios_no_store = 0

        stats = {}

        # Store each active user's checkjob info as a gzipped JSON cache file on GPFS;
        # a failure for one user is logged and counted, but does not abort the run.
        for user in active_users:
            path = get_pickle_path(opts.options.location, user, rest_client)
            try:
                user_queue_information = CheckjobInfo({user: job_information[user]})
                store_on_gpfs(user, path, "checkjob", user_queue_information, gpfs,
                              login_mount_point, gpfs_mount_point, ".checkjob.json.gz",
                              opts.options.dry_run)
                nagios_user_count += 1
            except Exception:
                logger.exception("Could not store cache file for user %s" % (user))
                nagios_no_store += 1

        stats["store_users"] = nagios_user_count
        stats["store_fail"] = nagios_no_store
        stats["store_fail_critical"] = STORE_LIMIT_CRITICAL
        # NOTE(review): stats is built but no opts.epilogue(...) call is visible in
        # this chunk — verify it follows the try/except in the full file.
    except Exception, err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)
def main():
    """Gather showq output and store per-user queue information as a cache file on GPFS.

    NOTE: this is Python 2 code (`except Exception, err` syntax below).
    """
    # Collect all info
    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., ofr each cluster will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'hosts': ('the hosts/clusters that should be contacted for job information', None, 'extend', []),
        'information': ('the sort of information to store: user, vo, project', None, 'store', 'user'),
        'location': ('the location for storing the pickle file: delcatty, muk', str, 'store', 'delcatty'),
        'account_page_url': ('the URL at which the account page resides', None, 'store', None),
        'access_token': ('the token that will allow authentication against the account page', None, 'store', None),
        'target_master': ('the master used to execute showq commands', None, 'store', None),
        'target_user': ('the user for ssh to the target master', None, 'store', None),
    }

    opts = ExtendedSimpleOption(options)

    try:
        rest_client = AccountpageClient(token=opts.options.access_token)

        gpfs = GpfsOperations()
        storage = VscStorage()
        storage_name = cluster_user_pickle_store_map[opts.options.location]
        login_mount_point = storage[storage_name].login_mount_point
        gpfs_mount_point = storage[storage_name].gpfs_mount_point

        # Build the per-host cluster config (master + showq binary path) from the config file.
        clusters = {}
        for host in opts.options.hosts:
            master = opts.configfile_parser.get(host, "master")
            showq_path = opts.configfile_parser.get(host, "showq_path")
            clusters[host] = {'master': master, 'path': showq_path}

        logger.debug("clusters = %s" % (clusters,))
        showq = SshShowq(opts.options.target_master,
                         opts.options.target_user,
                         clusters,
                         cache_pickle=True,
                         dry_run=opts.options.dry_run)

        logger.debug("Getting showq information ...")

        (queue_information, _, _) = showq.get_moab_command_information()
        timeinfo = time.time()

        active_users = queue_information.keys()

        logger.debug("Active users: %s" % (active_users))
        logger.debug("Queue information: %s" % (queue_information))

        # We need to determine which users should get an updated pickle. This depends on
        # - the active user set
        # - the information we want to provide on the cluster(set) where this script runs
        # At the same time, we need to determine the job information each user gets to see
        tup = (opts.options.information, active_users, queue_information, rest_client)
        (target_users, target_queue_information, user_map) = determine_target_information(*tup)

        nagios_user_count = 0
        nagios_no_store = 0

        stats = {}

        # Store each target user's queue information (plus user map and timestamp) as a
        # gzipped JSON cache file on GPFS; a per-user failure is logged and counted.
        for user in target_users:
            try:
                path = get_pickle_path(opts.options.location, user, rest_client)
                user_queue_information = target_queue_information[user]
                user_queue_information['timeinfo'] = timeinfo
                store_on_gpfs(user, path, "showq", (user_queue_information, user_map[user]), gpfs,
                              login_mount_point, gpfs_mount_point, ".showq.json.gz",
                              opts.options.dry_run)
                nagios_user_count += 1
            except Exception:
                logger.error("Could not store pickle file for user %s" % (user))
                nagios_no_store += 1

        stats["store_users"] = nagios_user_count
        stats["store_fail"] = nagios_no_store
        stats["store_fail_critical"] = STORE_LIMIT_CRITICAL
        # NOTE(review): stats is built but no opts.epilogue(...) call is visible in
        # this chunk — verify it follows the try/except in the full file.
    except Exception, err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)