def main():

    storage_settings = VscStorage()

    local_storage_conf = configparser.ConfigParser()
    local_storage_conf.read(QUOTA_CONF_FILE)

    gpfs = GpfsOperations()
    gpfs.list_filesystems()
    gpfs.list_filesets()

    for storage_name in local_storage_conf.get('MAIN', 'storage').split(','):

        filesystem_name = storage_settings[storage_name].filesystem
        filesystem_info = gpfs.get_filesystem_info(filesystem_name)

        if storage_name in ('VSC_HOME',):
            set_up_filesystem(gpfs, storage_settings, storage_name,
                              filesystem_info, filesystem_name)
            set_up_apps(gpfs, storage_settings, storage_name, filesystem_info,
                        filesystem_name)
        else:
            set_up_filesystem(gpfs,
                              storage_settings,
                              storage_name,
                              filesystem_info,
                              filesystem_name,
                              vo_support=True)
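
Note the one-element tuple in the membership test above: plain parentheses around a single string, ('VSC_HOME'), are just the string itself, so "in" would silently perform a substring check instead of a membership test. A quick illustration:

# With a bare string, "in" does substring matching -- almost never the intent here.
'HOME' in ('VSC_HOME')     # True: substring of the string 'VSC_HOME'
'HOME' in ('VSC_HOME',)    # False: membership test against a real tuple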
Example #2
    def __init__(self,
                 vo_id,
                 storage=None,
                 rest_client=None,
                 host_institute=GENT):
        """Initialise"""
        super(VscTier2AccountpageVo, self).__init__(vo_id, rest_client)

        self.vo_id = vo_id
        self.vsc = VSC()
        self.host_institute = host_institute

        if not storage:
            self.storage = VscStorage()
        else:
            self.storage = storage

        self.gpfs = GpfsOperations()
        self.posix = PosixOperations()

        self.dry_run = False

        self._vo_data_quota_cache = None
        self._vo_data_shared_quota_cache = None
        self._vo_scratch_quota_cache = None
        self._institute_quota_cache = None

        self._sharing_group_cache = None
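
The trailing _*_cache attributes set to None point at a lazy-loading pattern: quota data is fetched from the account page only on first access and memoised afterwards. A minimal sketch of what such an accessor could look like; the vo_data_quota name and the REST call are assumptions for illustration, not the actual API:

    @property
    def vo_data_quota(self):
        # Hypothetical accessor: fetch once, serve from the cache afterwards.
        if self._vo_data_quota_cache is None:
            self._vo_data_quota_cache = self.rest_client.vo[self.vo_id].quota.get()
        return self._vo_data_quota_cache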
Example #3
def main():
    """Yeah, so, erm. The main function and such."""

    options = {
        "summary": ("Give the summary", None, "store_true", True, 's'),
        "detail": (
            "Detailed information",
            None,
            "store_true",
            False,
        ),
        "virtualorganisation":
        ("Give VO details if available", None, "store_true", False, 'v'),
        "running":
        ("Display running job information", None, "store_true", False, 'r'),
        "idle":
        ("Display idle job information", None, "store_true", False, 'i'),
        "blocked": ("Dispay blocked job information", None, "store_true",
                    False, 'b'),
        'hosts': ("Hosts/clusters to check", None, 'extend', []),
        'location_environment':
        ('the location for storing the pickle file depending on the cluster',
         str, 'store', 'VSC_SCRATCH_DELCATTY'),
    }

    opts = simple_option(options, config_files=['/etc/myshowq.conf'])

    if not (opts.options.running or opts.options.idle or opts.options.blocked):
        opts.options.running = True
        opts.options.idle = True
        opts.options.blocked = True

    storage = VscStorage()
    user_name = getpwuid(os.getuid())[0]

    mount_point = storage[opts.options.location_environment].login_mount_point
    path_template = storage.path_templates[
        opts.options.location_environment]['user']
    path = os.path.join(mount_point, path_template[0],
                        path_template[1](user_name), ".showq.json.gz")

    (res, user_map) = read_cache(user_name, opts.options.virtualorganisation,
                                 opts.options.running, opts.options.idle,
                                 opts.options.blocked, path)

    if not res:
        print("no data")
        sys.exit(0)

    if opts.options.summary:
        showsummary(opts.options.hosts, res, user_map, user_name,
                    opts.options.virtualorganisation)
    if opts.options.detail:
        showdetail()
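
The cache path is assembled from a per-storage template: path_template[0] is a fixed subdirectory and path_template[1] is a callable mapping the user name to their personal subtree. A hypothetical illustration of the shape such a template could take (the real layout lives in vsc-config and may differ):

# Assumed template shape: a fixed prefix plus a callable that shards users
# by the leading characters of their VSC ID.
template = ('gent', lambda user: os.path.join(user[:6], user))
os.path.join('/user', template[0], template[1]('vsc40075'), '.showq.json.gz')
# -> '/user/gent/vsc400/vsc40075/.showq.json.gz'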
Example #4
    def __init__(self,
                 user_id,
                 storage=None,
                 pickle_storage=None,
                 rest_client=None,
                 account=None,
                 pubkeys=None,
                 host_institute=None,
                 use_user_cache=False):
        """
        Initialisation.
        @type user_id: string representing the user's VSC ID (vsc[0-9]{5})
        """
        super(VscTier2AccountpageUser,
              self).__init__(user_id,
                             rest_client,
                             account=account,
                             pubkeys=pubkeys,
                             use_user_cache=use_user_cache)

        # Move to vsc-config?
        default_pickle_storage = {
            GENT: VSC_SCRATCH_KYUKON,
            BRUSSEL: VSC_SCRATCH_THEIA,
        }

        if host_institute is None:
            host_institute = GENT
        self.host_institute = host_institute

        if pickle_storage is None:
            pickle_storage = default_pickle_storage[host_institute]

        self.pickle_storage = pickle_storage
        if storage is None:
            storage = VscStorage()

        self.institute_path_templates = storage.path_templates[
            self.host_institute]
        self.institute_storage = storage[self.host_institute]

        self.vsc = VSC()
        self.gpfs = GpfsOperations()  # Only used when needed
        self.posix = PosixOperations()
Example #5
def main():
    """
    Main script.
    - process the users and VOs
    - write the new timestamp if everything went OK
    - write the nagios check file
    """

    options = {
        'nagios-check-interval-threshold':
        NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'storage': ('storage systems on which to deploy users and vos', None,
                    'extend', []),
    }

    opts = ExtendedSimpleOption(options)
    stats = {}

    try:
        storage_settings = VscStorage()
        gpfs = GpfsOperations()
        gpfs.list_filesystems()
        gpfs.list_filesets()

        for storage_name in opts.options.storage:

            filesystem_name = storage_settings[storage_name].filesystem
            filesystem_info = gpfs.get_filesystem_info(filesystem_name)

            set_up_filesystem(gpfs,
                              storage_settings,
                              storage_name,
                              filesystem_info,
                              filesystem_name,
                              vo_support=True,
                              dry_run=opts.options.dry_run)

    except Exception as err:
        logging.exception("critical exception caught: %s", err)
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)

    opts.epilogue("UGent users and VOs synchronised", stats)
Example #6
    def __init__(self, user_id, storage=None, pickle_storage='VSC_SCRATCH_KYUKON', rest_client=None,
                 account=None, pubkeys=None, host_institute=None, use_user_cache=False):
        """
        Initialisation.
        @type user_id: string representing the user's VSC ID (vsc[0-9]{5})
        """
        super(VscTier2AccountpageUser, self).__init__(user_id, rest_client, account=account,
                                                      pubkeys=pubkeys, use_user_cache=use_user_cache)

        self.pickle_storage = pickle_storage
        if not storage:
            self.storage = VscStorage()
        else:
            self.storage = storage

        self.vsc = VSC()
        self.gpfs = GpfsOperations()  # Only used when needed
        self.posix = PosixOperations()
        self.host_institute = host_institute
Example #7
def main():

    options = {
        'jobid':
        ('Fully qualified identification of the job', None, 'store', None),
        'location_environment':
        ('the location for storing the pickle file depending on the cluster',
         str, 'store', 'VSC_SCRATCH_DELCATTY'),
    }
    opts = simple_option(options, config_files=['/etc/mycheckjob.conf'])

    storage = VscStorage()
    user_name = getpwuid(os.getuid())[0]

    mount_point = storage[opts.options.location_environment].login_mount_point
    path_template = storage.path_templates[
        opts.options.location_environment]['user']
    path = os.path.join(mount_point, path_template[0],
                        path_template[1](user_name), ".checkjob.json.gz")

    checkjob_info = read_cache(path)

    print(checkjob_info.display(opts.options.jobid))
Example #8
def main():
    """Main script"""

    options = {
        'nagios-check-interval-threshold':
        NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'storage': ('the VSC filesystems that are checked by this script',
                    None, 'extend', []),
        'write-cache': ('Write the data into the cache files in the FS', None,
                        'store_true', False),
        'account_page_url': ('Base URL of the account page', None, 'store',
                             'https://account.vscentrum.be/django'),
        'access_token': ('OAuth2 token to access the account page REST API',
                         None, 'store', None),
        'host_institute':
        ('Name of the institute where this script is being run', str, 'store',
         GENT),
    }
    opts = ExtendedSimpleOption(options)
    logger = opts.log

    try:
        client = AccountpageClient(token=opts.options.access_token)

        user_id_map = map_uids_to_names()  # is this really necessary?
        gpfs = GpfsOperations()
        storage = VscStorage()

        target_filesystems = [
            storage[s].filesystem for s in opts.options.storage
        ]

        filesystems = gpfs.list_filesystems(device=target_filesystems).keys()
        logger.debug("Found the following GPFS filesystems: %s" %
                     (filesystems))

        filesets = gpfs.list_filesets(devices=target_filesystems)
        logger.debug("Found the following GPFS filesets: %s" % (filesets))

        quota = gpfs.list_quota(devices=target_filesystems)
        exceeding_filesets = {}
        exceeding_users = {}
        stats = {}

        for storage_name in opts.options.storage:

            logger.info("Processing quota for storage_name %s" %
                        (storage_name))
            filesystem = storage[storage_name].filesystem
            replication_factor = storage[storage_name].data_replication_factor

            if filesystem not in filesystems:
                logger.error("Non-existent filesystem %s" % (filesystem))
                continue

            if filesystem not in quota:
                logger.error("No quota defined for storage_name %s [%s]",
                             storage_name, filesystem)
                continue

            quota_storage_map = get_mmrepquota_maps(
                quota[filesystem],
                storage_name,
                filesystem,
                filesets,
                replication_factor,
            )

            exceeding_filesets[storage_name] = process_fileset_quota(
                storage,
                gpfs,
                storage_name,
                filesystem,
                quota_storage_map['FILESET'],
                client,
                dry_run=opts.options.dry_run,
                institute=opts.options.host_institute)

            exceeding_users[storage_name] = process_user_quota(
                storage,
                gpfs,
                storage_name,
                None,
                quota_storage_map['USR'],
                user_id_map,
                client,
                dry_run=opts.options.dry_run,
                institute=opts.options.host_institute)

            stats["%s_fileset_critical" %
                  (storage_name, )] = QUOTA_FILESETS_CRITICAL
            if exceeding_filesets[storage_name]:
                stats["%s_fileset" % (storage_name, )] = 1
                logger.warning(
                    "storage_name %s found %d filesets that are exceeding their quota",
                    storage_name, len(exceeding_filesets[storage_name]))
                for (e_fileset, e_quota) in exceeding_filesets[storage_name]:
                    logger.warning("%s has quota %s", e_fileset, e_quota)
            else:
                stats["%s_fileset" % (storage_name, )] = 0
                logger.debug(
                    "storage_name %s found no filesets that are exceeding their quota",
                    storage_name)

            stats["%s_users_warning" % (storage_name, )] = QUOTA_USERS_WARNING
            stats["%s_users_critical" %
                  (storage_name, )] = QUOTA_USERS_CRITICAL
            if exceeding_users[storage_name]:
                stats["%s_users" % (storage_name, )] = len(
                    exceeding_users[storage_name])
                logger.warning(
                    "storage_name %s found %d users who are exceeding their quota",
                    storage_name, len(exceeding_users[storage_name]))
                for (e_user_id, e_quota) in exceeding_users[storage_name]:
                    logger.warning("%s has quota %s", e_user_id, e_quota)
            else:
                stats["%s_users" % (storage_name, )] = 0
                logger.debug(
                    "storage_name %s found no users who are exceeding their quota",
                    storage_name)

    except Exception as err:
        logger.exception("critical exception caught: %s", err)
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)

    opts.epilogue("quota check completed", stats)
Example #9
def main():
    # Collect all info

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g. those for each cluster, are obtained from the configuration file
    options = {
        'nagios-check-interval-threshold':
        NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'hosts':
        ('the hosts/clusters that should be contacted for job information',
         None, 'extend', []),
        'location': ('the location for storing the pickle file: delcatty, muk',
                     str, 'store', 'delcatty'),
        'access_token':
        ('the token that will allow authentication against the account page',
         None, 'store', None),
        'account_page_url':
        ('the URL at which the account page resides', None, 'store', None),
        'target_master':
        ('the master used to execute showq commands', None, 'store', None),
        'target_user':
        ('the user for ssh to the target master', None, 'store', None),
    }

    opts = ExtendedSimpleOption(options)

    try:
        rest_client = AccountpageClient(token=opts.options.access_token)

        gpfs = GpfsOperations()
        storage = VscStorage()
        storage_name = cluster_user_pickle_store_map[opts.options.location]
        login_mount_point = storage[storage_name].login_mount_point
        gpfs_mount_point = storage[storage_name].gpfs_mount_point

        clusters = {}
        for host in opts.options.hosts:
            master = opts.configfile_parser.get(host, "master")
            checkjob_path = opts.configfile_parser.get(host, "checkjob_path")
            clusters[host] = {'master': master, 'path': checkjob_path}

        checkjob = SshCheckjob(opts.options.target_master,
                               opts.options.target_user,
                               clusters,
                               cache_pickle=True,
                               dry_run=opts.options.dry_run)

        (job_information, _, _) = checkjob.get_moab_command_information()

        active_users = job_information.keys()

        logger.debug("Active users: %s" % (active_users))
        logger.debug("Checkjob information: %s" % (job_information))

        nagios_user_count = 0
        nagios_no_store = 0

        stats = {}

        for user in active_users:
            path = get_pickle_path(opts.options.location, user, rest_client)
            try:
                user_queue_information = CheckjobInfo(
                    {user: job_information[user]})
                store_on_gpfs(user, path, "checkjob", user_queue_information,
                              gpfs, login_mount_point, gpfs_mount_point,
                              ".checkjob.json.gz", opts.options.dry_run)
                nagios_user_count += 1
            except Exception:
                logger.exception("Could not store cache file for user %s" %
                                 (user))
                nagios_no_store += 1
        stats["store_users"] = nagios_user_count
        stats["store_fail"] = nagios_no_store
        stats["store_fail_critical"] = STORE_LIMIT_CRITICAL
    except Exception as err:
        logger.exception("critical exception caught: %s", err)
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)
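
Both this script and the showq variant below hand the per-user results to store_on_gpfs. A minimal sketch of the kind of write it presumably performs, i.e. a gzipped JSON blob at the user's cache path; the real helper also deals with ownership, the login versus gpfs mount point switch, and dry runs:

import gzip
import json
import os
import time

def write_user_cache(path, kind, data, dry_run=False):
    # Hypothetical stand-in for store_on_gpfs: write a timestamped, gzipped
    # JSON blob, then rename it into place (atomic within one filesystem).
    payload = json.dumps({'kind': kind, 'timestamp': time.time(), 'data': data})
    if dry_run:
        return
    tmp = path + '.tmp'
    with gzip.open(tmp, 'wt') as fh:
        fh.write(payload)
    os.rename(tmp, path)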
Example #10
def main():
    # Collect all info

    # Note: debug option is provided by generaloption
    # Note: other settings, e.g. those for each cluster, are obtained from the configuration file
    options = {
        'nagios-check-interval-threshold':
        NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'hosts':
        ('the hosts/clusters that should be contacted for job information',
         None, 'extend', []),
        'information': ('the sort of information to store: user, vo, project',
                        None, 'store', 'user'),
        'location': ('the location for storing the pickle file: delcatty, muk',
                     str, 'store', 'delcatty'),
        'account_page_url':
        ('the URL at which the account page resides', None, 'store', None),
        'access_token':
        ('the token that will allow authentication against the account page',
         None, 'store', None),
        'target_master':
        ('the master used to execute showq commands', None, 'store', None),
        'target_user': ('the user for ssh to the target master', None, 'store',
                        None),
    }

    opts = ExtendedSimpleOption(options)

    try:
        rest_client = AccountpageClient(token=opts.options.access_token)

        gpfs = GpfsOperations()
        storage = VscStorage()
        storage_name = cluster_user_pickle_store_map[opts.options.location]
        login_mount_point = storage[storage_name].login_mount_point
        gpfs_mount_point = storage[storage_name].gpfs_mount_point

        clusters = {}
        for host in opts.options.hosts:
            master = opts.configfile_parser.get(host, "master")
            showq_path = opts.configfile_parser.get(host, "showq_path")
            clusters[host] = {'master': master, 'path': showq_path}

        logger.debug("clusters = %s" % (clusters, ))
        showq = SshShowq(opts.options.target_master,
                         opts.options.target_user,
                         clusters,
                         cache_pickle=True,
                         dry_run=opts.options.dry_run)

        logger.debug("Getting showq information ...")

        (queue_information, _, _) = showq.get_moab_command_information()
        timeinfo = time.time()

        active_users = queue_information.keys()

        logger.debug("Active users: %s" % (active_users))
        logger.debug("Queue information: %s" % (queue_information))

        # We need to determine which users should get an updated pickle. This depends on
        # - the active user set
        # - the information we want to provide on the cluster(set) where this script runs
        # At the same time, we need to determine the job information each user gets to see
        (target_users, target_queue_information, user_map) = determine_target_information(
            opts.options.information, active_users, queue_information, rest_client)

        nagios_user_count = 0
        nagios_no_store = 0

        stats = {}

        for user in target_users:
            try:
                path = get_pickle_path(opts.options.location, user,
                                       rest_client)
                user_queue_information = target_queue_information[user]
                user_queue_information['timeinfo'] = timeinfo
                store_on_gpfs(user, path, "showq",
                              (user_queue_information, user_map[user]), gpfs,
                              login_mount_point, gpfs_mount_point,
                              ".showq.json.gz", opts.options.dry_run)
                nagios_user_count += 1
            except Exception:
                logger.error("Could not store pickle file for user %s" %
                             (user))
                nagios_no_store += 1

        stats["store_users"] = nagios_user_count
        stats["store_fail"] = nagios_no_store
        stats["store_fail_critical"] = STORE_LIMIT_CRITICAL
    except Exception as err:
        logger.exception("critical exception caught: %s", err)
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)