Example #1
    @classmethod
    def setUpClass(cls):
        config_dict = Config(CONFIGURATION)
        server_config_dict = Config(SERVER_CONFIGURATION)
        secrets_dict = Config(SECRETS)
        cls.config = config_dict
        cls.secrets = secrets_dict

        cls.glidein_site = config_dict['Glidein']['site']
        cls.minio_url = config_dict['StartdLogging']['url']
        cls.minio_bucket = config_dict['StartdLogging']['bucket']
        cls.minio_access_key = secrets_dict['StartdLogging']['access_key']
        cls.minio_secret_key = secrets_dict['StartdLogging']['secret_key']
        cls.minio_secure = True
        cls.pyglidein_client_name = 'pyglidein-client'
        cls.metrics_graphite_server = server_config_dict['metrics']['graphite_server']
        cls.metrics_namespace = server_config_dict['metrics']['namespace']

        cls.tmpdir = tempfile.mkdtemp()
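
setUpClass runs once per test class and must be declared as a classmethod on a unittest.TestCase subclass. A minimal sketch of that surrounding structure, with a hypothetical class name and stubbed values standing in for Config(CONFIGURATION), Config(SERVER_CONFIGURATION) and Config(SECRETS):

import shutil
import tempfile
import unittest


class TestStartdLogging(unittest.TestCase):  # hypothetical class name

    @classmethod
    def setUpClass(cls):
        # Stand-ins for Config(...), which is assumed to return nested dict-like objects.
        cls.config = {'Glidein': {'site': 'Test'},
                      'StartdLogging': {'url': 'http://localhost:9000', 'bucket': 'logs'}}
        cls.secrets = {'StartdLogging': {'access_key': 'test-access',
                                         'secret_key': 'test-secret'}}
        cls.tmpdir = tempfile.mkdtemp()

    @classmethod
    def tearDownClass(cls):
        # Remove the temporary directory created in setUpClass.
        shutil.rmtree(cls.tmpdir, ignore_errors=True)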
Example #2
def main():
    parser = OptionParser()
    parser.add_option('--config',
                      type='string',
                      default='cluster.config',
                      help="config file for cluster")
    parser.add_option('--secrets',
                      type='string',
                      default='.pyglidein_secrets',
                      help="secrets file for cluster")
    parser.add_option('--uuid',
                      type='string',
                      default=getpass.getuser() + '@' + socket.gethostname(),
                      help="Unique id for this client")
    (options, args) = parser.parse_args()

    config_dict = Config(options.config)
    config_glidein = config_dict['Glidein']
    config_cluster = config_dict['Cluster']
    if 'StartdLogging' in config_dict:
        config_startd_logging = config_dict['StartdLogging']
    else:
        config_startd_logging = {}

    if ('Mode' in config_dict and 'debug' in config_dict['Mode']
            and config_dict['Mode']['debug']):
        logging.basicConfig(level=logging.DEBUG,
                            format='%(asctime)s %(levelname)s %(message)s')
    else:
        logging.basicConfig(level=logging.INFO,
                            format='%(asctime)s %(levelname)s %(message)s')

    # Loading secrets.  Fail if permissions wrong.
    if os.path.isfile(options.secrets):
        if os.stat(options.secrets).st_mode & (stat.S_IXGRP | stat.S_IRWXO):
            logger.error('Set Permissions on {} to 600'.format(
                options.secrets))
            sys.exit(1)
        secrets_dict = Config(options.secrets)
        if 'StartdLogging' in secrets_dict:
            secrets_startd_logging = secrets_dict['StartdLogging']
        else:
            secrets_startd_logging = {}
    else:
        logger.error(
            'Error Accessing Secrets File: {}.  '.format(options.secrets) +
            'Did you set the --secrets flag?')
        sys.exit(1)

    # Importing the correct class to handle the submit
    sched_type = config_cluster["scheduler"].lower()
    if sched_type == "htcondor":
        scheduler = submit.SubmitCondor(config_dict, secrets_dict)
        metrics = client_metrics.ClientMetricsCondor(config_dict, secrets_dict)
    elif sched_type == "pbs":
        scheduler = submit.SubmitPBS(config_dict, secrets_dict)
        metrics = client_metrics.ClientMetricsPBS(config_dict, secrets_dict)
    elif sched_type == "slurm":
        scheduler = submit.SubmitSLURM(config_dict, secrets_dict)
        metrics = client_metrics.ClientMetricsSlurm(config_dict, secrets_dict)
    elif sched_type == "uge":
        scheduler = submit.SubmitUGE(config_dict, secrets_dict)
        metrics = client_metrics.ClientMetricsPBS(config_dict, secrets_dict)
    elif sched_type == "lsf":
        scheduler = submit.SubmitLSF(config_dict, secrets_dict)
        metrics = client_metrics.ClientMetricsLSF(config_dict, secrets_dict)
    else:
        raise Exception('scheduler not supported')

    # if "glidein_cmd" not in config_dict["Glidein"]:
    #     raise Exception('no glidein_cmd')

    # Failing if startd logging is enabled and python version < 2.7
    if ('send_startd_logs' in config_startd_logging
            and config_startd_logging['send_startd_logs'] is True
            and sys.version_info < (2, 7)):
        logger.error('Python version must be 2.7 or later to enable startd logging.')
        sys.exit(1)
    # Checking on startd logging configuration if enabled
    if ('send_startd_logs' in config_startd_logging
            and config_startd_logging['send_startd_logs'] is True):
        for config_val in ['url', 'bucket']:
            if config_val not in config_startd_logging:
                logger.error(
                    'Missing %s configuration value in StartdLogging Section' %
                    config_val)
                sys.exit(1)
        for secret_val in ['access_key', 'secret_key']:
            if secret_val not in secrets_startd_logging:
                logger.error(
                    'Missing %s secret value in StartdLogging Section' %
                    secret_val)
                sys.exit(1)

    while True:
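        # One iteration: fetch the queue state (over SSH or from the server address),
        # then submit glideins per partition until the configured limits are reached.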
        if 'ssh_state' in config_glidein and config_glidein['ssh_state']:
            state = get_ssh_state()
        else:
            state = get_state(config_glidein['address'])
        if 'uuid' in config_glidein:
            options.uuid = config_glidein['uuid']
        info = {
            'uuid': options.uuid,
            'glideins_idle': dict(),
            'glideins_running': dict(),
            'glideins_launched': dict(),
        }
        metrics_bundle = client_metrics.ClientMetricsBundle(options.uuid)
        if state:
            for partition in config_dict['Cluster'].get(
                    'partitions', ['Cluster']):
                config_cluster = config_dict[partition]
                if "running_cmd" not in config_cluster:
                    raise Exception('Section [%s] has no running_cmd' %
                                    partition)
                idle = 0
                try:
                    info['glideins_running'][partition] = get_running(
                        config_cluster["running_cmd"])
                    metrics_bundle.update_metric(
                        'glideins_running', partition,
                        info['glideins_running'][partition])
                    if "idle_cmd" in config_cluster:
                        idle = get_running(config_cluster["idle_cmd"])
                        info['glideins_idle'][partition] = idle
                        metrics_bundle.update_metric(
                            'glideins_idle', partition,
                            info['glideins_idle'][partition])
                except Exception:
                    logger.warning('error getting running job count',
                                   exc_info=True)
                    continue
                info['glideins_launched'][partition] = 0
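                # Cap submissions by the per-submit limit, the room left under
                # max_total_jobs, and the headroom under max_idle_jobs (default 1000).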
                limit = min(
                    config_cluster["limit_per_submit"],
                    config_cluster["max_total_jobs"] -
                    info['glideins_running'][partition],
                    max(config_cluster.get("max_idle_jobs", 1000) - idle, 0))
                # Prioritize job submission. By default, prioritize submission of GPU and high-memory jobs.
                state = sort_states(state, config_cluster["prioritize_jobs"])
                for s in state:
                    if sched_type == "pbs":
                        s["memory"] = s["memory"] * 1024 / 1000
                    if limit <= 0:
                        logger.info('reached limit')
                        break
                    # Skipping CPU jobs for GPU-only clusters
                    if ('gpu_only' in config_cluster
                            and config_cluster['gpu_only'] and s["gpus"] == 0):
                        continue
                    # Skipping GPU jobs for CPU-only clusters
                    if ('cpu_only' in config_cluster
                            and config_cluster['cpu_only'] and s["gpus"] != 0):
                        continue
                    # Skipping jobs over cluster resource limits
                    if config_cluster['whole_node']:
                        prefix = 'whole_node_%s'
                    else:
                        prefix = 'max_%s_per_job'
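                    # for/else: the else branch below runs only if no resource
                    # limit caused a break, i.e. the job fits this cluster.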
                    for resource in ('cpus', 'gpus', 'memory', 'disk'):
                        cfg_name = prefix % resource
                        if (cfg_name in config_cluster
                                and s[resource] > config_cluster[cfg_name]):
                            break
                        cfg_name = 'min_%s_per_job' % resource
                        if (cfg_name in config_cluster
                                and s[resource] < config_cluster[cfg_name]):
                            break
                    else:
                        if "count" in s and s["count"] > limit:
                            s["count"] = limit
                        scheduler.submit(s, partition)
                        num = 1 if "count" not in s else s["count"]
                        limit -= num
                        info['glideins_launched'][partition] += num
                metrics_bundle.update_metric(
                    'glideins_launched', partition,
                    info['glideins_launched'][partition])
                logger.info('launched %d glideins on %s',
                            info['glideins_launched'][partition], partition)
        else:
            logger.info('no state, nothing to do')

        metrics_bundle.update_metrics(metrics.get_mma_idle_time())
        metrics.send(metrics_bundle)

        if 'delay' not in config_glidein or int(config_glidein['delay']) < 1:
            break
        time.sleep(int(config_glidein['delay']))
    for partition in config_dict['Cluster'].get('partitions', ['Cluster']):
        config_cluster = config_dict[partition]
        if "cleanup" in config_cluster and config_cluster["cleanup"]:
            scheduler.cleanup(config_cluster["running_cmd"],
                              config_cluster["dir_cleanup"])
Example #3
def main():
    parser = OptionParser()
    parser.add_option('-p',
                      '--port',
                      type='int',
                      default=11001,
                      help='Port to serve from (default: 11001)')
    parser.add_option('-u',
                      '--user',
                      type='string',
                      default=None,
                      help='Only track a single user')
    parser.add_option('--constraint',
                      type='string',
                      default=None,
                      help='HTCondor constraint expression')
    parser.add_option(
        '--delay',
        type='int',
        default=300,
        help='delay between calls to condor_q (default: 300 seconds)')
    parser.add_option('--debug',
                      action='store_true',
                      default=False,
                      help='Enable debug logging')
    parser.add_option('--config',
                      type='string',
                      default='pyglidein_server.config',
                      help="config file for cluster")
    (options, args) = parser.parse_args()

    config = Config(options.config)

    logformat = '%(asctime)s %(levelname)s %(name)s : %(message)s'
    if options.debug:
        logging.basicConfig(level=logging.DEBUG, format=logformat)
    else:
        logging.basicConfig(level=logging.INFO, format=logformat)

    if options.delay < 0 or options.delay > 1000:
        raise Exception('delay out of range')

    if config.get('metrics', {}).get('enable_metrics', False):
        metrics_sender_client = MetricsSenderClient(config['metrics'])
    else:
        metrics_sender_client = None

    cfg = {
        'options': options,
        'config': config,
        'condor_q': False,
        'state': [],
        'monitoring': {},
        'metrics_sender_client': metrics_sender_client
    }

    # load condor_q
    IOLoop.instance().call_later(5, partial(condor_q_helper, cfg))

    # setup server
    s = server(cfg)
    s.start()
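
condor_q_helper itself is not shown in Example #3; presumably it polls condor_q, stores the result in cfg['state'], and re-arms itself on the IOLoop. A minimal sketch of that self-rescheduling pattern under those assumptions (query_condor_q is a placeholder, not a real function from the project):

import logging
from functools import partial

from tornado.ioloop import IOLoop


def query_condor_q(cfg):
    """Placeholder for the real condor_q query (not part of the original code)."""
    return []


def condor_q_helper(cfg):
    """Hypothetical periodic task: refresh cfg['state'] and re-arm itself."""
    try:
        cfg['state'] = query_condor_q(cfg)
        cfg['condor_q'] = True
    except Exception:
        logging.exception('condor_q poll failed')
    finally:
        # Reschedule after the configured delay, mirroring the initial call_later(5, ...).
        IOLoop.current().call_later(cfg['options'].delay, partial(condor_q_helper, cfg))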
Example #4
def main():
    parser = OptionParser()
    parser.add_option('-p', '--port', type='int', default=11001,
                      help='Port to serve from (default: 11001)')
    parser.add_option('-u', '--user', type='string', default=None,
                      help='Only track a single user')
    parser.add_option('--constraint', type='string', default=None,
                      help='HTCondor constraint expression')
    parser.add_option('--delay', type='int', default=300,
                      help='delay between calls to condor_q (default: 300 seconds)')
    parser.add_option('--debug', action='store_true', default=False,
                      help='Enable debug logging')
    parser.add_option('--config', type='string', default='pyglidein_server.config',
                      help="config file for cluster")
    parser.add_option('-n', '--no-daemon', dest='daemon', default=True, action='store_false',
                      help='do not daemonize')
    parser.add_option('--logfile', default='log',
                      help='filename for logging (daemon mode)')
    (options, args) = parser.parse_args()

    config = Config(options.config)

    logformat = '%(asctime)s %(levelname)s %(name)s : %(message)s'
    kwargs = {
        'format': logformat,
        'level': logging.DEBUG if options.debug else logging.INFO,
    }
    
    if options.daemon:
        kwargs['filename'] = options.logfile

    if options.delay < 0 or options.delay > 1000:
        raise Exception('delay out of range')
        
    if config.get('metrics', {}).get('enable_metrics', False):
        metrics_sender_client = MetricsSenderClient(config['metrics'])
    else:
        metrics_sender_client = None

    cfg = {'options': options, 'config': config, 'condor_q': False, 'state': [], 'monitoring': {},
           'metrics_sender_client': metrics_sender_client}
    
    def starter():
        logging.basicConfig(**kwargs)

        # load condor_q
        IOLoop.instance().call_later(5, partial(condor_q_helper, cfg))

        # setup server
        s = server(cfg)
        s.start()

    if options.daemon:
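        # The first positional argument selects the daemon action:
        # start (default), stop, restart or kill.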
        from pyglidein.daemon import Daemon
        pid = '/tmp/authorlist.pid'
        d = Daemon(pidfile=pid, chdir=os.getcwd(),
                   runner=starter)
        action = args[0] if args else None
        if (not action) or action == 'start':
            d.start()
        elif action == 'stop':
            d.stop()
        elif action == 'restart':
            d.restart()
        elif action == 'kill':
            d.kill()
        else:
            raise Exception('unknown action')
    else:
        starter()
Example #5
def main():
    parser = OptionParser()
    parser.add_option('-p', '--port', type='int', default=11001,
                      help='Port to serve from (default: 11001)')
    parser.add_option('-u', '--user', type='string', default=None,
                      help='Only track a single user')
    parser.add_option('--constraint', type='string', default=None,
                      help='HTCondor constraint expression')
    parser.add_option('--delay', type='int', default=300,
                      help='delay between calls to condor_q (default: 300 seconds)')
    parser.add_option('--debug', action='store_true', default=False,
                      help='Enable debug logging')
    parser.add_option('--config', type='string', default='pyglidein_server.config',
                      help="config file for cluster")
    parser.add_option('-n', '--no-daemon', dest='daemon', default=True, action='store_false',
                      help='do not daemonize')
    parser.add_option('--logfile', default='log',
                      help='filename for logging (daemon mode)')
    (options, args) = parser.parse_args()

    config = Config(options.config)

    logformat = '%(asctime)s %(levelname)s %(name)s : %(message)s'
    kwargs = {
        'format': logformat,
        'level': logging.DEBUG if options.debug else logging.INFO,
    }
    
    if options.daemon:
        kwargs['filename'] = options.logfile

    if options.delay < 0 or options.delay > 1000:
        raise Exception('delay out of range')
        
    if config.get('metrics', {}).get('enable_metrics', False):
        metrics_sender_client = MetricsSenderClient(config['metrics'])
    else:
        metrics_sender_client = None

    cfg = {'options': options, 'config': config, 'condor_q': False, 'state': [], 'monitoring': {},
           'metrics_sender_client': metrics_sender_client}
    
    def starter():
        logging.basicConfig(**kwargs)

        # load condor_q
        IOLoop.current().call_later(5, partial(condor_q, cfg))

        # setup server
        s = server(cfg)
        s.start()

    if options.daemon:
        from pyglidein.daemon import Daemon
        pid = '/tmp/authorlist.pid'
        d = Daemon(pidfile=pid, chdir=os.getcwd(),
                   runner=starter)
        action = args[0] if args else None
        if (not action) or action == 'start':
            d.start()
        elif action == 'stop':
            d.stop()
        elif action == 'restart':
            d.restart()
        elif action == 'kill':
            d.kill()
        else:
            raise Exception('unknown action')
    else:
        starter()