def main():
    """Dump GPFS fileset (inode) information to gzipped JSON logs.

    For every GPFS filesystem the fileset information is stored as a gzipped
    JSON file under --location, the inode usage is processed, and the admins
    are mailed when filesets approach their limit. Nagios/Icinga statistics
    are reported through the epilogue.
    """
    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'location': ('path to store the gzipped files', None, 'store', INODE_LOG_ZIP_PATH),
    }
    opts = ExtendedSimpleOption(options)
    stats = {}

    try:
        gpfs = GpfsOperations()
        filesets = gpfs.list_filesets()

        if not os.path.exists(opts.options.location):
            # 0755 (Python 2 octal literal) is a syntax error on Python 3; use 0o755
            os.makedirs(opts.options.location, 0o755)

        critical_filesets = dict()

        for filesystem in filesets:
            stats["%s_inodes_log_critical" % (filesystem,)] = INODE_STORE_LOG_CRITICAL
            try:
                filename = "gpfs_inodes_%s_%s.gz" % (time.strftime("%Y%m%d-%H:%M"), filesystem)
                path = os.path.join(opts.options.location, filename)
                # context manager closes the file even when the write fails;
                # gzip streams opened 'wb' require bytes, hence the encode()
                with gzip.open(path, 'wb', 9) as zipfile:  # Compress to the max
                    zipfile.write(json.dumps(filesets[filesystem]).encode())
                stats["%s_inodes_log" % (filesystem,)] = 0
                logger.info("Stored inodes information for FS %s", filesystem)

                cfs = process_inodes_information(filesets[filesystem])
                logger.info("Processed inodes information for filesystem %s", filesystem)
                if cfs:
                    critical_filesets[filesystem] = cfs
                    logger.info("Filesystem %s has at least %d filesets reaching the limit", filesystem, len(cfs))
            except Exception:
                # one filesystem failing must not stop the others; record it in the stats
                stats["%s_inodes_log" % (filesystem,)] = 1
                logger.exception("Failed storing inodes information for FS %s", filesystem)

        logger.info("Critical filesets: %s", critical_filesets)
        if critical_filesets:
            mail_admins(critical_filesets, opts.options.dry_run)

    except Exception:
        logger.exception("Failure obtaining GPFS inodes")
        opts.critical("Failure to obtain GPFS inodes information")
        sys.exit(NAGIOS_EXIT_CRITICAL)

    opts.epilogue("Logged GPFS inodes", stats)
def main():
    """Dump quota information from the selected storage backend to gzipped JSON logs.

    Each filesystem's quota report is stored as a gzipped JSON file under
    --location; nagios statistics are reported through the epilogue.
    """
    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'location': ('path to store the gzipped files', None, 'store', QUOTA_LOG_ZIP_PATH),
        'backend': ('Storage backend', None, 'store', 'gpfs'),
    }
    opts = ExtendedSimpleOption(options)
    stats = {}
    backend = opts.options.backend

    try:
        if backend == 'gpfs':
            storage_backend = GpfsOperations()
        elif backend == 'lustre':
            storage_backend = LustreOperations()
        else:
            # fail explicitly instead of falling through to a NameError on
            # storage_backend; the outer except turns this into a critical state.
            # (logger.exception is only valid inside an except handler, so log an error)
            logger.error("Backend %s not supported", backend)
            raise ValueError("Backend %s not supported" % backend)

        quota = storage_backend.list_quota()

        if not os.path.exists(opts.options.location):
            os.makedirs(opts.options.location, 0o755)

        for key in quota:
            stats["%s_quota_log_critical" % (key,)] = QUOTA_STORE_LOG_CRITICAL
            try:
                filename = "%s_quota_%s_%s.gz" % (backend, time.strftime("%Y%m%d-%H:%M"), key)
                path = os.path.join(opts.options.location, filename)
                # context manager guarantees the file is closed even on write errors
                with gzip.open(path, 'wb', 9) as zipfile:  # Compress to the max
                    zipfile.write(json.dumps(quota[key]).encode())
                stats["%s_quota_log" % (key,)] = 0
                logger.info("Stored quota information for FS %s", key)
            except Exception:
                # one filesystem failing must not stop the others
                stats["%s_quota_log" % (key,)] = 1
                logger.exception("Failed storing quota information for FS %s", key)
    except Exception:
        logger.exception("Failure obtaining %s quota", backend)
        opts.critical("Failure to obtain %s quota information" % backend)

    opts.epilogue("Logged %s quota" % backend, stats)
def main():
    """
    Set the options and initiates the main run.

    Optionally (re)creates the Lmod cache, converts it to JSON, and then
    checks the cache freshness. Returns the errors if any in a nagios/icinga
    friendly way.
    """
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'create-cache': ('Create the Lmod cache', None, 'store_true', False),
        'freshness-threshold': ('The interval in minutes for how long we consider the cache to be fresh',
                                'int', 'store', 120),
    }
    opts = ExtendedSimpleOption(options)

    try:
        if opts.options.create_cache:
            opts.log.info("Updating the Lmod cache")
            exitcode, msg = run_cache_create()
            if exitcode != 0:
                opts.log.error("Lmod cache update failed: %s", msg)
                opts.critical("Lmod cache update failed")
            try:
                convert_lmod_cache_to_json()
            except Exception as err:
                opts.log.exception("Lmod to JSON failed: %s", err)
                opts.critical("Lmod to JSON failed.")

        opts.log.info("Checking the Lmod cache freshness")
        timestamp = os.stat(get_lmod_conf()['timestamp'])
        # give a warning when the cache is older than --freshness-threshold
        if (time.time() - timestamp.st_mtime) > opts.options.freshness_threshold * 60:
            errmsg = "Lmod cache is not fresh"
            # Logger.warn is a deprecated alias of Logger.warning
            opts.log.warning(errmsg)
            opts.warning(errmsg)

    except RuntimeError as err:
        opts.log.exception("Failed to update Lmod cache: %s", err)
        opts.critical("Failed to update Lmod cache. See logs.")
    except Exception as err:  # pylint: disable=W0703
        opts.log.exception("critical exception caught: %s", err)
        opts.critical("Script failed because of uncaught exception. See logs.")

    if opts.options.create_cache:
        opts.epilogue("Lmod cache updated.")
    else:
        opts.epilogue("Lmod cache is still fresh.")
def main():
    """Dump GPFS quota information to gzipped JSON logs.

    Each filesystem's quota report is stored as a gzipped JSON file under
    --location; nagios statistics are reported through the epilogue.
    """
    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'location': ('path to store the gzipped files', None, 'store', QUOTA_LOG_ZIP_PATH),
    }
    opts = ExtendedSimpleOption(options)
    stats = {}

    try:
        gpfs = GpfsOperations()
        quota = gpfs.list_quota()

        if not os.path.exists(opts.options.location):
            # 0755 (Python 2 octal literal) is a syntax error on Python 3; use 0o755
            os.makedirs(opts.options.location, 0o755)

        for key in quota:
            stats["%s_quota_log_critical" % (key,)] = QUOTA_STORE_LOG_CRITICAL
            try:
                filename = "gpfs_quota_%s_%s.gz" % (time.strftime("%Y%m%d-%H:%M"), key)
                path = os.path.join(opts.options.location, filename)
                # context manager closes the file even when the write fails;
                # gzip streams opened 'wb' require bytes, hence the encode()
                with gzip.open(path, 'wb', 9) as zipfile:  # Compress to the max
                    zipfile.write(json.dumps(quota[key]).encode())
                stats["%s_quota_log" % (key,)] = 0
                logger.info("Stored quota information for FS %s", key)
            except Exception:
                # one filesystem failing must not stop the others
                stats["%s_quota_log" % (key,)] = 1
                logger.exception("Failed storing quota information for FS %s", key)
    except Exception:
        logger.exception("Failure obtaining GPFS quota")
        opts.critical("Failure to obtain GPFS quota information")
        sys.exit(NAGIOS_EXIT_CRITICAL)

    opts.epilogue("Logged GPFS quota", stats)
def main():
    """
    Main script.
    - process the users and VOs
    - write the new timestamp if everything went OK
    - write the nagios check file
    """
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'storage': ('storage systems on which to deploy users and vos', None, 'extend', []),
    }
    opts = ExtendedSimpleOption(options)
    stats = {}

    try:
        storage_config = VscStorage()
        operations = GpfsOperations()
        # prime the GPFS caches up front so the per-storage loop below can reuse them
        operations.list_filesystems()
        operations.list_filesets()

        for storage in opts.options.storage:
            fs_name = storage_config[storage].filesystem
            fs_info = operations.get_filesystem_info(fs_name)
            set_up_filesystem(
                operations,
                storage_config,
                storage,
                fs_info,
                fs_name,
                vo_support=True,
                dry_run=opts.options.dry_run,
            )
    except Exception as err:
        logging.exception("critical exception caught: %s", err)
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)

    opts.epilogue("UGent users and VOs synchronised", stats)
def main():
    """
    Main script.
    - build the filter
    - fetches the users
    - process the users
    - write the new timestamp if everything went OK
    - write the nagios check file

    Synchronises changed UGent accounts, their storage quota, and VOs from the
    accountpage to the storage systems given on the command line. The sync
    timestamp is only advanced when no per-storage processing failed.
    """
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'storage': ('storage systems on which to deploy users and vos', None, 'extend', []),
        'user': ('process users', None, 'store_true', False),
        'vo': ('process vos', None, 'store_true', False),
        'access_token': ('OAuth2 token to access the account page REST API', None, 'store', None),
        'account_page_url': ('URL of the account page where we can find the REST API', None, 'store', None),
        'host_institute': ('Name of the institute where this script is being run', str, 'store', GENT),
    }
    opts = ExtendedSimpleOption(options)
    stats = {}

    try:
        # taken before the sync starts so we do not lose changes made while syncing
        now = datetime.utcnow()
        client = AccountpageClient(token=opts.options.access_token, url=opts.options.account_page_url + "/api/")

        try:
            last_timestamp = read_timestamp(SYNC_TIMESTAMP_FILENAME)
        except Exception:
            # fall back to a full resync from a fixed date when the cached timestamp is unreadable
            logger.exception("Something broke reading the timestamp from %s" % SYNC_TIMESTAMP_FILENAME)
            last_timestamp = "200901010000Z"

        logger.info("Last recorded timestamp was %s" % (last_timestamp))
        last_timestamp = convert_to_unix_timestamp(last_timestamp)

        (users_ok, users_fail) = ([], [])
        (quota_ok, quota_fail) = ([], [])
        if opts.options.user:
            # accounts changed since the last sync
            ugent_changed_accounts = client.account.institute['gent'].modified[last_timestamp].get()[1]

            logger.info("Found %d UGent accounts that have changed in the accountpage since %s" %
                        (len(ugent_changed_accounts), last_timestamp))

            ugent_accounts = [u['vsc_id'] for u in ugent_changed_accounts]
            ugent_accounts = nub(ugent_accounts)  # deduplicate, preserving order

            for storage_name in opts.options.storage:
                (users_ok, users_fail) = process_users(opts.options, ugent_accounts, storage_name, client,
                                                       opts.options.host_institute)
                stats["%s_users_sync" % (storage_name, )] = len(users_ok)
                stats["%s_users_sync_fail" % (storage_name, )] = len(users_fail)
                stats["%s_users_sync_fail_warning" % (storage_name, )] = STORAGE_USERS_LIMIT_WARNING
                stats["%s_users_sync_fail_critical" % (storage_name, )] = STORAGE_USERS_LIMIT_CRITICAL

            for storage_name in opts.options.storage:
                # quota changed since the last sync on this particular storage system
                storage_changed_quota = [mkVscUserSizeQuota(q) for q in
                                         client.quota.user.storage[storage_name].modified[last_timestamp].get()[1]]
                # only user filesets (vsc IDs) are relevant here
                storage_changed_quota = [q for q in storage_changed_quota if q.fileset.startswith('vsc')]
                logger.info("Found %d accounts that have changed quota on storage %s in the accountpage since %s",
                            len(storage_changed_quota), storage_name, last_timestamp)
                (quota_ok, quota_fail) = process_users_quota(opts.options, storage_changed_quota, storage_name,
                                                             client, opts.options.host_institute)
                stats["%s_quota_sync" % (storage_name, )] = len(quota_ok)
                stats["%s_quota_sync_fail" % (storage_name, )] = len(quota_fail)
                stats["%s_quota_sync_fail_warning" % (storage_name, )] = STORAGE_QUOTA_LIMIT_WARNING
                stats["%s_quota_sync_fail_critical" % (storage_name, )] = STORAGE_QUOTA_LIMIT_CRITICAL

        (vos_ok, vos_fail) = ([], [])
        if opts.options.vo:
            # a VO needs processing when either the VO itself or its quota changed
            ugent_changed_vos = client.vo.modified[last_timestamp].get()[1]
            ugent_changed_vo_quota = client.quota.vo.modified[last_timestamp].get()[1]

            ugent_vos = sorted(set([v['vsc_id'] for v in ugent_changed_vos] +
                                   [v['virtual_organisation'] for v in ugent_changed_vo_quota]))

            logger.info("Found %d UGent VOs that have changed in the accountpage since %s" %
                        (len(ugent_changed_vos), last_timestamp))
            logger.info("Found %d UGent VOs that have changed quota in the accountpage since %s" %
                        (len(ugent_changed_vo_quota), last_timestamp))
            logger.debug("Found the following UGent VOs: {vos}".format(vos=ugent_vos))

            for storage_name in opts.options.storage:
                (vos_ok, vos_fail) = process_vos(opts.options, ugent_vos, storage_name, client, last_timestamp,
                                                 opts.options.host_institute)
                stats["%s_vos_sync" % (storage_name, )] = len(vos_ok)
                stats["%s_vos_sync_fail" % (storage_name, )] = len(vos_fail)
                stats["%s_vos_sync_fail_warning" % (storage_name, )] = STORAGE_VO_LIMIT_WARNING
                stats["%s_vos_sync_fail_critical" % (storage_name, )] = STORAGE_VO_LIMIT_CRITICAL

        # only advance the timestamp when everything succeeded, so failed items
        # are retried on the next run
        if not (users_fail or quota_fail or vos_fail):
            (_, ldap_timestamp) = convert_timestamp(now)
            if not opts.options.dry_run:
                write_timestamp(SYNC_TIMESTAMP_FILENAME, ldap_timestamp)
    except Exception as err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)

    opts.epilogue("UGent users and VOs synchronised", stats)
def main():
    """
    Main script. The usual.

    Synchronises accountpage VOs and their members to the Slurm database by
    generating and executing sacctmgr commands. The sync timestamp is only
    written after the commands were executed (never in dry-run mode).
    """
    options = {
        "nagios-check-interval-threshold": NAGIOS_CHECK_INTERVAL_THRESHOLD,
        "access_token": ("OAuth2 token to access the account page REST API", None, "store", None),
        "account_page_url": (
            "URL of the account page where we can find the REST API",
            str,
            "store",
            "https://apivsc.ugent.be/django",
        ),
        'host_institute': ('Name of the institute where this script is being run', str, 'store', GENT),
        "clusters": (
            "Cluster(s) (comma-separated) to sync for. "
            "Overrides <host_institute>_SLURM_COMPUTE_CLUSTERS that are in production.",
            "strlist",
            "store",
            [],
        ),
        'start_timestamp': ('Timestamp to start the sync from', str, 'store', None),
        'cluster_classes': ('Classes of clusters that should be synced, comma-separated', "strlist", 'store',
                            [PRODUCTION, PILOT])
    }
    opts = ExtendedSimpleOption(options)
    stats = {}

    (last_timestamp, start_time) = retrieve_timestamp_with_default(
        SYNC_TIMESTAMP_FILENAME,
        start_timestamp=opts.options.start_timestamp)
    logging.info("Using timestamp %s", last_timestamp)
    logging.info("Using startime %s", start_time)

    try:
        client = AccountpageClient(token=opts.options.access_token, url=opts.options.account_page_url + "/api/")
        host_institute = opts.options.host_institute

        # current state of the Slurm database
        slurm_account_info = get_slurm_acct_info(SyncTypes.accounts)
        slurm_user_info = get_slurm_acct_info(SyncTypes.users)

        logging.debug("%d accounts found", len(slurm_account_info))
        logging.debug("%d users found", len(slurm_user_info))

        # explicit --clusters wins; otherwise take all clusters of the requested classes
        if opts.options.clusters:
            clusters = opts.options.clusters
        else:
            clusters = [cs for p in opts.options.cluster_classes
                        for cs in VSC_SLURM_CLUSTERS[host_institute][p]]

        sacctmgr_commands = []

        # All users belong to a VO, so fetching the VOs is necessary/
        account_page_vos = [mkVo(v) for v in client.vo.institute[opts.options.host_institute].get()[1]]

        # make sure the institutes and the default accounts (VOs) are there for each cluster
        institute_vos = dict([(v.vsc_id, v) for v in account_page_vos
                              if v.vsc_id in INSTITUTE_VOS_BY_INSTITUTE[host_institute].values()])
        sacctmgr_commands += slurm_institute_accounts(slurm_account_info, clusters, host_institute, institute_vos)

        # The VOs do not track active state of users, so we need to fetch all accounts as well
        active_accounts = set([a["vsc_id"] for a in client.account.get()[1] if a["isactive"]])

        # dictionary mapping the VO vsc_id on a tuple with the VO members and the VO itself
        account_page_members = dict([(vo.vsc_id, (set(vo.members), vo)) for vo in account_page_vos])

        # process all regular VOs
        sacctmgr_commands += slurm_vo_accounts(account_page_vos, slurm_account_info, clusters, host_institute)

        # process VO members
        sacctmgr_commands += slurm_user_accounts(account_page_members, active_accounts, slurm_user_info, clusters,
                                                 opts.options.dry_run)

        logging.info("Executing %d commands", len(sacctmgr_commands))

        if opts.options.dry_run:
            print("Commands to be executed:\n")
            print("\n".join([" ".join(c) for c in sacctmgr_commands]))
        else:
            execute_commands(sacctmgr_commands)

        # only advance the timestamp after a real (non-dry-run) execution
        if not opts.options.dry_run:
            (_, ldap_timestamp) = convert_timestamp(start_time)
            write_timestamp(SYNC_TIMESTAMP_FILENAME, ldap_timestamp)
            opts.epilogue("Accounts synced to slurm", stats)
        else:
            logging.info("Dry run done")

    except Exception as err:
        logging.exception("critical exception caught: %s", err)
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)
def main():
    """Synchronise accountpage users and groups to the LDAP-backed Django DB.

    The actual sync runs in a forked child that drops privileges to the
    apache user; the parent waits for the child and only advances the sync
    timestamp when the child exited cleanly (and this is not a dry run).
    """
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'start-timestamp': ("The timestamp form which to start, otherwise use the cached value", None, "store", None),
        'access_token': ('OAuth2 token identifying the user with the accountpage', None, 'store', None),
        'account_page_url': ('url for the account page', None, 'store', None),
        'start_timestamp': ('Timestamp to start the sync from', str, 'store', None),
    }
    # get access_token from conf file
    ExtendedSimpleOption.CONFIGFILES_INIT = ['/etc/account_page.conf']
    opts = ExtendedSimpleOption(options)
    stats = {}

    # Creating this here because this is a singleton class
    _ = LdapQuery(VscConfiguration(VSC_CONF_DEFAULT_FILENAME))

    (last_timestamp, start_time) = retrieve_timestamp_with_default(
        SYNC_TIMESTAMP_FILENAME,
        start_timestamp=opts.options.start_timestamp)
    logging.info("Using timestamp %s", last_timestamp)
    logging.info("Using startime %s", start_time)

    try:
        parent_pid = os.fork()
        logging.info("Forked.")
    except OSError:
        logging.exception("Could not fork")
        parent_pid = 1
    except Exception:
        logging.exception("Oops")
        parent_pid = 1

    if parent_pid == 0:
        # child: perform the sync with reduced privileges
        try:
            global logger
            logger = fancylogger.getLogger(NAGIOS_HEADER)
            # drop privileges in the child
            try:
                apache_uid = pwd.getpwnam('apache').pw_uid
                apache_gid = grp.getgrnam('apache').gr_gid

                # order matters: groups and gid must be set before giving up root
                os.setgroups([])
                os.setgid(apache_gid)
                os.setuid(apache_uid)

                logging.info("Now running as %s" % (os.geteuid(), ))
            except OSError:
                logger.raiseException("Could not drop privileges")

            client = AccountpageClient(token=opts.options.access_token,
                                       url=opts.options.account_page_url + '/api/')
            syncer = LdapSyncer(client)
            last = last_timestamp
            altered_accounts = syncer.sync_altered_accounts(last, opts.options.dry_run)
            logging.debug("Altered accounts: %s", altered_accounts)
            altered_groups = syncer.sync_altered_groups(last, opts.options.dry_run)
            logging.debug("Altered groups: %s" % altered_groups)

            if not altered_accounts[ERROR] and not altered_groups[ERROR]:
                logging.info("Child process exiting correctly")
                sys.exit(0)
            else:
                # NOTE(review): sys.exit(-1) surfaces as exit status 255 to the parent
                logging.info("Child process exiting with status -1")
                logging.warning("Error occured in %s" % ([
                    "%s: %s\n" % (k, v) for (k, v) in [
                        ("altered accounts", altered_accounts[ERROR]),
                        ("altered groups", altered_groups[ERROR]),
                    ]
                ]))
                sys.exit(-1)
        except Exception:
            logging.exception("Child caught an exception")
            sys.exit(-1)
    else:
        # parent
        (_, result) = os.waitpid(parent_pid, 0)
        logging.info("Child exited with exit code %d" % (result, ))

        # only advance the timestamp when the child succeeded and this is no dry run
        if not result and not opts.options.dry_run:
            (_, ldap_timestamp) = convert_timestamp(start_time)
            write_timestamp(SYNC_TIMESTAMP_FILENAME, ldap_timestamp)
            opts.epilogue("Synchronised LDAP users to the Django DB", stats)
        else:
            sys.exit(NAGIOS_EXIT_CRITICAL)
def main():
    """Synchronise accountpage users and groups to the LDAP-backed Django DB.

    Older variant: resolves the start timestamp manually (command line value,
    cached file, or a hardcoded fallback), then forks a child that drops
    privileges to the apache user and performs the sync. The parent waits and
    only advances the timestamp on clean child exit.
    """
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'start-timestamp': ("The timestamp form which to start, otherwise use the cached value", None, "store", None),
        'access_token': ('OAuth2 token identifying the user with the accountpage', None, 'store', None),
        'account_page_url': ('url for the account page', None, 'store', None),
    }
    # get access_token from conf file
    ExtendedSimpleOption.CONFIGFILES_INIT = ['/etc/account_page.conf']
    opts = ExtendedSimpleOption(options)
    stats = {}

    # Creating this here because this is a singleton class
    _ = LdapQuery(VscConfiguration(VSC_CONF_DEFAULT_FILENAME))

    last_timestamp = opts.options.start_timestamp
    if not last_timestamp:
        try:
            last_timestamp = read_timestamp(SYNC_TIMESTAMP_FILENAME)
        except Exception:
            _log.warning("Something broke reading the timestamp from %s", SYNC_TIMESTAMP_FILENAME)
            last_timestamp = "201710230000Z"
            _log.warning("We will resync from a hardcoded know working sync a while back : %s", last_timestamp)

    _log.info("Using timestamp %s", last_timestamp)

    # record starttime before starting, and take a 10 sec safety buffer so we don't get gaps where users are approved
    # in between the requesting of modified users and writing out the start time
    start_time = datetime.datetime.now() + datetime.timedelta(seconds=-10)
    _log.info("startime %s", start_time)

    try:
        parent_pid = os.fork()
        _log.info("Forked.")
    except OSError:
        _log.exception("Could not fork")
        parent_pid = 1
    except Exception:
        _log.exception("Oops")
        parent_pid = 1

    if parent_pid == 0:
        # child: perform the sync with reduced privileges
        try:
            global _log
            _log = fancylogger.getLogger(NAGIOS_HEADER)
            # drop privileges in the child
            try:
                apache_uid = pwd.getpwnam('apache').pw_uid
                apache_gid = grp.getgrnam('apache').gr_gid

                # order matters: groups and gid must be set before giving up root
                os.setgroups([])
                os.setgid(apache_gid)
                os.setuid(apache_uid)

                _log.info("Now running as %s" % (os.geteuid(), ))
            except OSError:
                _log.raiseException("Could not drop privileges")

            client = AccountpageClient(token=opts.options.access_token,
                                       url=opts.options.account_page_url + '/api/')
            syncer = LdapSyncer(client)
            # convert the LDAP-style timestamp to seconds since the epoch
            last = int((datetime.datetime.strptime(last_timestamp, "%Y%m%d%H%M%SZ") -
                        datetime.datetime(1970, 1, 1)).total_seconds())
            altered_accounts = syncer.sync_altered_accounts(last, opts.options.dry_run)
            _log.debug("Altered accounts: %s", altered_accounts)
            altered_groups = syncer.sync_altered_groups(last, opts.options.dry_run)
            _log.debug("Altered groups: %s" % altered_groups)

            if not altered_accounts[ERROR] and not altered_groups[ERROR]:
                _log.info("Child process exiting correctly")
                sys.exit(0)
            else:
                # NOTE(review): sys.exit(-1) surfaces as exit status 255 to the parent
                _log.info("Child process exiting with status -1")
                _log.warning("Error occured in %s" % ([
                    "%s: %s\n" % (k, v) for (k, v) in [
                        ("altered accounts", altered_accounts[ERROR]),
                        ("altered groups", altered_groups[ERROR]),
                    ]
                ]))
                sys.exit(-1)
        except Exception:
            _log.exception("Child caught an exception")
            sys.exit(-1)
    else:
        # parent
        (_, result) = os.waitpid(parent_pid, 0)
        _log.info("Child exited with exit code %d" % (result, ))

        if not result:
            # only advance the cached timestamp when none was forced on the command line
            if not opts.options.start_timestamp:
                (_, ldap_timestamp) = convert_timestamp(start_time)
                if not opts.options.dry_run:
                    write_timestamp(SYNC_TIMESTAMP_FILENAME, ldap_timestamp)
            else:
                _log.info("Not updating the timestamp, since one was provided on the command line")
            opts.epilogue("Synchronised LDAP users to the Django DB", stats)
        else:
            _log.info("Not updating the timestamp, since it was given on the command line for this run")
            sys.exit(NAGIOS_EXIT_CRITICAL)
def main():
    """
    Main script.
    - build the filter
    - fetches the users
    - process the users
    - write the new timestamp if everything went OK
    - write the nagios check file

    Institute-generic variant: synchronises changed accounts, user quota and
    VOs from the accountpage to the given storage systems. The sync timestamp
    is only advanced when nothing failed and this is not a dry run.
    """
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'storage': ('storage systems on which to deploy users and vos', None, 'extend', []),
        'user': ('process users', None, 'store_true', False),
        'vo': ('process vos', None, 'store_true', False),
        'access_token': ('OAuth2 token to access the account page REST API', None, 'store', None),
        'account_page_url': ('URL of the account page where we can find the REST API', None, 'store', None),
        'host_institute': ('Name of the institute where this script is being run', str, 'store', GENT),
        'start_timestamp': ('Timestamp to start the sync from', str, 'store', None),
    }
    opts = ExtendedSimpleOption(options)
    stats = {}

    (last_timestamp, start_time) = retrieve_timestamp_with_default(
        SYNC_TIMESTAMP_FILENAME,
        start_timestamp=opts.options.start_timestamp)
    logging.info("Using timestamp %s", last_timestamp)
    logging.info("Using startime %s", start_time)

    try:
        client = AccountpageClient(token=opts.options.access_token, url=opts.options.account_page_url + "/api/")
        institute = opts.options.host_institute

        (users_ok, users_fail) = ([], [])
        (quota_ok, quota_fail) = ([], [])
        if opts.options.user:
            # accounts changed since the last sync
            changed_accounts = client.account.institute[institute].modified[last_timestamp].get()[1]

            logging.info("Found %d %s accounts that have changed in the accountpage since %s" %
                         (len(changed_accounts), institute, last_timestamp))

            accounts = nub([u['vsc_id'] for u in changed_accounts])  # deduplicate

            for storage_name in opts.options.storage:
                (users_ok, users_fail) = process_users(opts.options, accounts, storage_name, client, institute)
                stats["%s_users_sync" % (storage_name, )] = len(users_ok)
                stats["%s_users_sync_fail" % (storage_name, )] = len(users_fail)
                stats["%s_users_sync_fail_warning" % (storage_name, )] = STORAGE_USERS_LIMIT_WARNING
                stats["%s_users_sync_fail_critical" % (storage_name, )] = STORAGE_USERS_LIMIT_CRITICAL

            for storage_name in opts.options.storage:
                # quota changed since the last sync on this particular storage system
                storage_changed_quota = [mkVscUserSizeQuota(q) for q in
                                         client.quota.user.storage[storage_name].modified[last_timestamp].get()[1]]
                # only user filesets (vsc IDs) are relevant here
                storage_changed_quota = [q for q in storage_changed_quota if q.fileset.startswith('vsc')]
                logging.info("Found %d accounts that have changed quota on storage %s in the accountpage since %s",
                             len(storage_changed_quota), storage_name, last_timestamp)
                (quota_ok, quota_fail) = process_users_quota(opts.options, storage_changed_quota, storage_name,
                                                             client, institute)
                stats["%s_quota_sync" % (storage_name, )] = len(quota_ok)
                stats["%s_quota_sync_fail" % (storage_name, )] = len(quota_fail)
                stats["%s_quota_sync_fail_warning" % (storage_name, )] = STORAGE_QUOTA_LIMIT_WARNING
                stats["%s_quota_sync_fail_critical" % (storage_name, )] = STORAGE_QUOTA_LIMIT_CRITICAL

        (vos_ok, vos_fail) = ([], [])
        if opts.options.vo:
            # a VO needs processing when either the VO itself or its quota changed
            changed_vos = client.vo.institute[institute].modified[last_timestamp].get()[1]
            changed_vo_quota = client.quota.vo.modified[last_timestamp].get()[1]

            vos = sorted(set([v['vsc_id'] for v in changed_vos] +
                             [v['virtual_organisation'] for v in changed_vo_quota]))

            logging.info("Found %d %s VOs that have changed in the accountpage since %s" %
                         (len(changed_vos), institute, last_timestamp))
            logging.info("Found %d %s VOs that have changed quota in the accountpage since %s" %
                         (len(changed_vo_quota), institute, last_timestamp))
            logging.debug("Found the following {institute} VOs: {vos}".format(institute=institute, vos=vos))

            for storage_name in opts.options.storage:
                (vos_ok, vos_fail) = process_vos(opts.options, vos, storage_name, client, last_timestamp, institute)
                stats["%s_vos_sync" % (storage_name, )] = len(vos_ok)
                stats["%s_vos_sync_fail" % (storage_name, )] = len(vos_fail)
                stats["%s_vos_sync_fail_warning" % (storage_name, )] = STORAGE_VO_LIMIT_WARNING
                stats["%s_vos_sync_fail_critical" % (storage_name, )] = STORAGE_VO_LIMIT_CRITICAL

        # only advance the timestamp when everything succeeded, so failed items
        # are retried on the next run
        if not (users_fail or quota_fail or vos_fail) and not opts.options.dry_run:
            (_, ldap_timestamp) = convert_timestamp(start_time)
            write_timestamp(SYNC_TIMESTAMP_FILENAME, ldap_timestamp)
    except Exception as err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)

    opts.epilogue("%s users and VOs synchronised" % institute, stats)
def main():
    """Main script

    Checks fileset and user quota usage on the given VSC filesystems and
    reports exceeders via nagios stats and log warnings.
    """
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'storage': ('the VSC filesystems that are checked by this script', None, 'extend', []),
        'write-cache': ('Write the data into the cache files in the FS', None, 'store_true', False),
        'account_page_url': ('Base URL of the account page', None, 'store', 'https://account.vscentrum.be/django'),
        'access_token': ('OAuth2 token to access the account page REST API', None, 'store', None),
        'host_institute': ('Name of the institute where this script is being run', str, 'store', GENT),
    }
    opts = ExtendedSimpleOption(options)
    logger = opts.log

    try:
        client = AccountpageClient(token=opts.options.access_token)

        user_id_map = map_uids_to_names()  # is this really necessary?
        gpfs = GpfsOperations()
        storage = VscStorage()

        # restrict the (expensive) GPFS queries to the requested filesystems
        target_filesystems = [storage[s].filesystem for s in opts.options.storage]

        filesystems = gpfs.list_filesystems(device=target_filesystems).keys()
        logger.debug("Found the following GPFS filesystems: %s" % (filesystems))

        filesets = gpfs.list_filesets(devices=target_filesystems)
        logger.debug("Found the following GPFS filesets: %s" % (filesets))

        quota = gpfs.list_quota(devices=target_filesystems)
        exceeding_filesets = {}
        exceeding_users = {}
        # NOTE(review): stats is defined inside the try; an exception before this
        # point would make the epilogue below fail on an undefined name
        stats = {}

        for storage_name in opts.options.storage:
            logger.info("Processing quota for storage_name %s" % (storage_name))
            filesystem = storage[storage_name].filesystem
            replication_factor = storage[storage_name].data_replication_factor

            if filesystem not in filesystems:
                logger.error("Non-existent filesystem %s" % (filesystem))
                continue

            if filesystem not in quota.keys():
                logger.error("No quota defined for storage_name %s [%s]" % (storage_name, filesystem))
                continue

            quota_storage_map = get_mmrepquota_maps(
                quota[filesystem],
                storage_name,
                filesystem,
                filesets,
                replication_factor,
            )

            exceeding_filesets[storage_name] = process_fileset_quota(
                storage, gpfs, storage_name, filesystem, quota_storage_map['FILESET'], client,
                dry_run=opts.options.dry_run, institute=opts.options.host_institute)
            exceeding_users[storage_name] = process_user_quota(
                storage, gpfs, storage_name, None, quota_storage_map['USR'], user_id_map, client,
                dry_run=opts.options.dry_run, institute=opts.options.host_institute)

            stats["%s_fileset_critical" % (storage_name, )] = QUOTA_FILESETS_CRITICAL
            if exceeding_filesets[storage_name]:
                stats["%s_fileset" % (storage_name, )] = 1
                logger.warning("storage_name %s found %d filesets that are exceeding their quota",
                               storage_name, len(exceeding_filesets))
                for (e_fileset, e_quota) in exceeding_filesets[storage_name]:
                    logger.warning("%s has quota %s" % (e_fileset, str(e_quota)))
            else:
                stats["%s_fileset" % (storage_name, )] = 0
                logger.debug("storage_name %s found no filesets that are exceeding their quota" % storage_name)

            stats["%s_users_warning" % (storage_name, )] = QUOTA_USERS_WARNING
            stats["%s_users_critical" % (storage_name, )] = QUOTA_USERS_CRITICAL
            if exceeding_users[storage_name]:
                stats["%s_users" % (storage_name, )] = len(exceeding_users[storage_name])
                logger.warning("storage_name %s found %d users who are exceeding their quota" %
                               (storage_name, len(exceeding_users[storage_name])))
                for (e_user_id, e_quota) in exceeding_users[storage_name]:
                    logger.warning("%s has quota %s" % (e_user_id, str(e_quota)))
            else:
                stats["%s_users" % (storage_name, )] = 0
                logger.debug("storage_name %s found no users who are exceeding their quota" % storage_name)

    except Exception as err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")

    opts.epilogue("quota check completed", stats)
def main():
    """Dump GPFS fileset (inode) information to gzipped JSON logs.

    For every GPFS filesystem the fileset information is stored as a gzipped
    JSON file under --location, the inode usage is checked against the
    fileset quota, and the admins are mailed when filesets approach their
    limit. Nagios statistics are reported through the epilogue.
    """
    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'location': ('path to store the gzipped files', None, 'store', INODE_LOG_ZIP_PATH),
    }
    opts = ExtendedSimpleOption(options)
    logger = opts.log
    stats = {}

    try:
        gpfs = GpfsOperations()
        filesets = gpfs.list_filesets()
        quota = gpfs.list_quota()

        if not os.path.exists(opts.options.location):
            os.makedirs(opts.options.location, 0o755)

        critical_filesets = dict()

        for filesystem in filesets:
            stats["%s_inodes_log_critical" % (filesystem,)] = INODE_STORE_LOG_CRITICAL
            try:
                filename = "gpfs_inodes_%s_%s.gz" % (time.strftime("%Y%m%d-%H:%M"), filesystem)
                path = os.path.join(opts.options.location, filename)
                # context manager closes the file even when the write fails;
                # gzip streams opened 'wb' require bytes on Python 3, hence the encode()
                with gzip.open(path, 'wb', 9) as zipfile:  # Compress to the max
                    zipfile.write(json.dumps(filesets[filesystem]).encode())
                stats["%s_inodes_log" % (filesystem,)] = 0
                logger.info("Stored inodes information for FS %s", filesystem)

                # flag filesets at or above 90% of their inode limit
                cfs = process_inodes_information(filesets[filesystem], quota[filesystem]['FILESET'], threshold=0.9)
                logger.info("Processed inodes information for filesystem %s", filesystem)
                if cfs:
                    critical_filesets[filesystem] = cfs
                    logger.info("Filesystem %s has at least %d filesets reaching the limit", filesystem, len(cfs))
            except Exception:
                # one filesystem failing must not stop the others
                stats["%s_inodes_log" % (filesystem,)] = 1
                logger.exception("Failed storing inodes information for FS %s", filesystem)

        logger.info("Critical filesets: %s", critical_filesets)
        if critical_filesets:
            mail_admins(critical_filesets, opts.options.dry_run)

    except Exception:
        logger.exception("Failure obtaining GPFS inodes")
        opts.critical("Failure to obtain GPFS inodes information")

    opts.epilogue("Logged GPFS inodes", stats)
def main():
    """
    Main script. The usual.

    Older Gent-only variant of the Slurm account sync: builds sacctmgr
    commands for institute accounts, VOs and VO members, then executes them
    (or prints them in dry-run mode).
    """
    options = {
        "nagios-check-interval-threshold": NAGIOS_CHECK_INTERVAL_THRESHOLD,
        "access_token": ("OAuth2 token to access the account page REST API", None, "store", None),
        "account_page_url": (
            "URL of the account page where we can find the REST API",
            str,
            "store",
            "https://apivsc.ugent.be/django",
        ),
        "clusters": (
            "Cluster(s) (comma-separated) to sync for. "
            "Overrides GENT_SLURM_COMPUTE_CLUSTERS that are in production.",
            str,
            "store",
            None,
        ),
    }
    opts = ExtendedSimpleOption(options)
    stats = {}

    try:
        client = AccountpageClient(token=opts.options.access_token, url=opts.options.account_page_url + "/api/")

        last_timestamp = "201804010000Z"  # the beginning of time
        logging.info("Last recorded timestamp was %s" % (last_timestamp))

        # current state of the Slurm database
        slurm_account_info = get_slurm_acct_info(SyncTypes.accounts)
        slurm_user_info = get_slurm_acct_info(SyncTypes.users)

        logging.debug("%d accounts found", len(slurm_account_info))
        logging.debug("%d users found", len(slurm_user_info))

        # explicit --clusters wins; otherwise take all production clusters
        if opts.options.clusters is not None:
            clusters = opts.options.clusters.split(",")
        else:
            clusters = [c for c in GENT_SLURM_COMPUTE_CLUSTERS if c in GENT_PRODUCTION_COMPUTE_CLUSTERS]

        sacctmgr_commands = []

        # make sure the institutes and the default accounts (VOs) are there for each cluster
        sacctmgr_commands += slurm_institute_accounts(slurm_account_info, clusters)

        # All users belong to a VO, so fetching the VOs is necessary/
        account_page_vos = [mkVo(v) for v in client.vo.get()[1]]

        # The VOs do not track active state of users, so we need to fetch all accounts as well
        active_accounts = set([a["vsc_id"] for a in client.account.get()[1] if a["isactive"]])

        # dictionary mapping the VO vsc_id on a tuple with the VO members and the VO itself
        account_page_members = dict([(vo.vsc_id, (set(vo.members), vo)) for vo in account_page_vos])

        # process all regular VOs
        sacctmgr_commands += slurm_vo_accounts(account_page_vos, slurm_account_info, clusters)

        # process VO members
        sacctmgr_commands += slurm_user_accounts(account_page_members, active_accounts, slurm_user_info, clusters,
                                                 opts.options.dry_run)

        logging.info("Executing %d commands", len(sacctmgr_commands))

        if opts.options.dry_run:
            print("Commands to be executed:\n")
            print("\n".join([" ".join(c) for c in sacctmgr_commands]))
        else:
            execute_commands(sacctmgr_commands)

    except Exception as err:
        logger.exception("critical exception caught: %s" % (err))
        opts.critical("Script failed in a horrible way")
        sys.exit(NAGIOS_EXIT_CRITICAL)

    if not opts.options.dry_run:
        opts.epilogue("Accounts synced to slurm", stats)
    else:
        logger.info("Dry run done")
def main():
    """Dump fileset (inode) information from the storage backend to gzipped JSON logs.

    For every filesystem the fileset information is stored as a gzipped JSON
    file under --location, the inode usage is checked against the fileset
    quota, and the admins are mailed when filesets approach their limit.
    Nagios statistics are reported through the epilogue.
    """
    # Note: debug option is provided by generaloption
    # Note: other settings, e.g., for each cluster will be obtained from the configuration file
    options = {
        'nagios-check-interval-threshold': NAGIOS_CHECK_INTERVAL_THRESHOLD,
        'location': ('path to store the gzipped files', None, 'store', INODE_LOG_ZIP_PATH),
        'backend': ('Storage backend', None, 'store', 'gpfs'),
        'host_institute': ('Name of the institute where this script is being run', str, 'store', GENT),
    }
    opts = ExtendedSimpleOption(options)
    logger = opts.log
    stats = {}
    backend = opts.options.backend

    try:
        if backend == 'gpfs':
            storage_backend = GpfsOperations()
        elif backend == 'lustre':
            storage_backend = LustreOperations()
        else:
            # fail explicitly instead of falling through to a NameError on
            # storage_backend; the outer except turns this into a critical state
            logger.error("Backend %s not supported", backend)
            raise ValueError("Backend %s not supported" % backend)

        filesets = storage_backend.list_filesets()
        quota = storage_backend.list_quota()

        if not os.path.exists(opts.options.location):
            os.makedirs(opts.options.location, 0o755)

        critical_filesets = dict()

        for filesystem in filesets:
            stats["%s_inodes_log_critical" % (filesystem,)] = INODE_STORE_LOG_CRITICAL
            try:
                filename = "%s_inodes_%s_%s.gz" % (backend, time.strftime("%Y%m%d-%H:%M"), filesystem)
                path = os.path.join(opts.options.location, filename)
                # context manager closes the file even when the write fails;
                # gzip streams opened 'wb' require bytes on Python 3, hence the encode()
                with gzip.open(path, 'wb', 9) as zipfile:  # Compress to the max
                    zipfile.write(json.dumps(filesets[filesystem]).encode())
                stats["%s_inodes_log" % (filesystem,)] = 0
                logger.info("Stored inodes information for FS %s", filesystem)

                # flag filesets at or above 90% of their inode limit
                cfs = process_inodes_information(filesets[filesystem], quota[filesystem]['FILESET'],
                                                 threshold=0.9, storage=backend)
                logger.info("Processed inodes information for filesystem %s", filesystem)
                if cfs:
                    critical_filesets[filesystem] = cfs
                    logger.info("Filesystem %s has at least %d filesets reaching the limit", filesystem, len(cfs))
            except Exception:
                # one filesystem failing must not stop the others
                stats["%s_inodes_log" % (filesystem,)] = 1
                logger.exception("Failed storing inodes information for FS %s", filesystem)

        logger.info("Critical filesets: %s", critical_filesets)
        if critical_filesets:
            mail_admins(critical_filesets,
                        dry_run=opts.options.dry_run,
                        host_institute=opts.options.host_institute)
    except Exception:
        logger.exception("Failure obtaining %s inodes", backend)
        opts.critical("Failure to obtain %s inodes information" % backend)

    opts.epilogue("Logged %s inodes" % backend, stats)