Пример #1
0
def main():
    """
    The entrypoint for the zoe-observer script.

    Loads the configuration, sets up logging and then polls Swarm in an
    endless loop: every listed container whose status contains 'Exited' is
    reported through container_died(), after which check_guests() is run.
    The loop pauses for ``loop_time`` seconds between iterations and stops
    on KeyboardInterrupt.

    :return: None
    """
    load_configuration()
    args = get_conf()
    if args.debug:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)

    logging.getLogger('kazoo').setLevel(logging.WARNING)
    logging.getLogger('requests').setLevel(logging.WARNING)
    logging.getLogger('docker').setLevel(logging.INFO)

    swarm = SwarmClient(args)

    while True:
        try:
            try:
                zoe_containers = swarm.list('zoe.{}'.format(get_conf().container_name_prefix))
                for c in zoe_containers:
                    if 'Exited' in c['status']:
                        container_died(c['labels']['zoe.container_id'])

                check_guests(swarm)
            except Exception:
                # Keep the observer alive: log the failure and try again on
                # the next iteration.
                log.exception('Something bad happened')

            # Sleep after failed iterations as well; otherwise a persistent
            # error would be retried in a tight loop, flooding the log.
            time.sleep(get_conf().loop_time)

        except KeyboardInterrupt:
            # Raised either while polling or while sleeping.
            break
Пример #2
0
def guest_check_thread(args):
    """
    Periodically report exited containers and run the guest checks.

    Every iteration lists the containers labelled with this deployment's
    name. Each one whose status contains 'Exited' is reported through
    container_died(); when that raises ZoeAPIException the container is
    deleted from Swarm instead. check_guests() is run afterwards.
    The function never returns.

    :param args: configuration object passed to SwarmClient
    """
    swarm = SwarmClient(args)

    while True:
        try:
            zoe_containers = swarm.list(
                {'zoe.deployment_name': get_conf().deployment_name})
            for c in zoe_containers:
                if 'Exited' not in c['status']:
                    continue
                zoe_id = c['labels']['zoe.service.id']
                try:
                    container_died(zoe_id)
                except ZoeAPIException:
                    log.warning(
                        'Container %s has died, but Zoe does not know anything about it, deleting',
                        c['name'])
                    swarm.terminate_container(c['id'], delete=True)

            check_guests(swarm)

        except Exception:
            # Keep the thread alive whatever went wrong in this iteration.
            log.exception('Something bad happened')

        # Pause after failed iterations too; with the sleep inside the try
        # block a persistent error would be retried without any delay.
        time.sleep(get_conf().loop_time)
Пример #3
0
def container_died(service_id):
    """
    Tell the master, via the REST API, that a service's container died.

    A ZoeAPIException whose message is "No such service" is ignored; any
    other ZoeAPIException is logged with its traceback and not re-raised.

    :param service_id: identifier handed to ZoeServiceAPI.died()
    """
    log.debug('A container died')
    master_url = get_conf().master_url
    admin_password = get_conf().zoeadmin_password
    service_api = ZoeServiceAPI(master_url, 'zoeadmin', admin_password)
    try:
        service_api.died(service_id)
    except ZoeAPIException as err:
        if err.message == "No such service":
            # The master does not know the service: nothing to report.
            return
        log.exception('Error reporting a dead service')
Пример #4
0
def container_died(service_id):
    """
    Notify the master through the REST API that a container has died.

    Errors from the API are not propagated: a ZoeAPIException with the
    message "No such service" is dropped silently, any other one is logged
    together with its traceback.

    :param service_id: identifier forwarded to ZoeServiceAPI.died()
    """
    log.debug('A container died')
    connection = (get_conf().master_url, 'zoeadmin', get_conf().zoeadmin_password)
    api = ZoeServiceAPI(*connection)
    try:
        api.died(service_id)
    except ZoeAPIException as exc:
        unknown_service = exc.message == "No such service"
        if not unknown_service:
            log.exception('Error reporting a dead service')
Пример #5
0
def guest_check_thread(args):
    """
    Loop forever, reporting exited containers and checking guest executions.

    On each pass the containers carrying this deployment's
    'zoe.deployment_name' label are listed. Those whose status contains
    'Exited' are passed to container_died(); if that raises ZoeAPIException
    the container is removed from Swarm. Finally check_guests() is called.

    :param args: configuration object used to build the SwarmClient
    """
    swarm = SwarmClient(args)

    while True:
        try:
            zoe_containers = swarm.list({'zoe.deployment_name': get_conf().deployment_name})
            exited = [c for c in zoe_containers if 'Exited' in c['status']]
            for c in exited:
                zoe_id = c['labels']['zoe.service.id']
                try:
                    container_died(zoe_id)
                except ZoeAPIException:
                    log.warning('Container %s has died, but Zoe does not know anything about it, deleting', c['name'])
                    swarm.terminate_container(c['id'], delete=True)

            check_guests(swarm)

        except Exception:
            # A failing pass must not kill the thread; log it and carry on.
            log.exception('Something bad happened')

        # The delay sits outside the try block so that it also applies after
        # a failed pass, instead of retrying immediately in a busy loop.
        time.sleep(get_conf().loop_time)
Пример #6
0
def check_guests(swarm):
    """
    Terminate guest executions whose Spark job has been idle for too long.

    Users with role 'guest' and all executions are fetched from the
    scheduler API. For every execution owned by a guest that has at least
    one container, the containers are inspected until one exposing a port
    named 'Spark application web interface' makes check_spark_job() return
    a true value; the execution is then terminated and its remaining
    containers are not inspected.

    :param swarm: Swarm client, forwarded to check_spark_job()
    """
    query_api = ZoeQueryAPI(get_conf().scheduler_url, 'zoeadmin', get_conf().zoeadmin_password)
    exec_api = ZoeExecutionsAPI(get_conf().scheduler_url, 'zoeadmin', get_conf().zoeadmin_password)
    cont_api = ZoeContainerAPI(get_conf().scheduler_url, 'zoeadmin', get_conf().zoeadmin_password)

    guests = query_api.query('user', role='guest')
    execs = exec_api.list()
    for guest in guests:
        my_execs = [e for e in execs if e['owner'] == guest['id']]
        for my_exec in my_execs:
            if not my_exec['containers']:
                continue
            # NOTE(review): now() is naive local time; this assumes
            # 'time_started' parses to a naive datetime as well - confirm.
            since_started = datetime.datetime.now() - dateutil.parser.parse(my_exec['time_started'])
            my_exec_since_started = since_started.total_seconds()
            terminate = False
            for container_id in my_exec['containers']:
                container = cont_api.get(container_id)
                for port in container['ports']:
                    if port['name'] != 'Spark application web interface':
                        continue
                    if check_spark_job(swarm, container['docker_id'], my_exec_since_started):
                        log.info('Execution {} for user {} has been idle for too long, terminating...'.format(my_exec['name'], guest['name']))
                        terminate = True
                        break
                # This check must sit at container-loop level: placed inside
                # the port loop it can never run with terminate set, and the
                # remaining containers would still be fetched and checked.
                if terminate:
                    break
            if terminate:
                exec_api.terminate(my_exec['id'])
Пример #7
0
def main_callback(event):
    """
    Handle one event and report dead containers of this deployment.

    Events whose 'Type' is not 'container', events without a
    'zoe.deployment_name' attribute and events belonging to another
    deployment are ignored. For the remaining ones a "die" action triggers
    container_died() with the event's 'zoe.service.id' attribute.

    :param event: event mapping with 'Type', 'Action' and 'Actor' entries
    """
    if event['Type'] != 'container':
        return

    try:
        foreign = event['Actor']['Attributes']['zoe.deployment_name'] != get_conf().deployment_name
    except KeyError:
        # No deployment label: not an event we care about.
        return
    else:
        if foreign:
            return

    log.debug(event)

    if event['Action'] == "die":
        try:
            container_died(event['Actor']['Attributes']['zoe.service.id'])
        except KeyError:
            pass
Пример #8
0
def main_callback(event):
    """
    Callback for a single event; reports containers of this deployment that died.

    Nothing happens unless the event has 'Type' equal to 'container' and a
    'zoe.deployment_name' attribute matching the configured deployment name.
    Matching events are logged at debug level and, when their 'Action' is
    "die", the 'zoe.service.id' attribute is passed to container_died().

    :param event: event mapping with 'Type', 'Action' and 'Actor' entries
    """
    if event['Type'] != 'container':
        return

    try:
        labels = event['Actor']['Attributes']
        other_deployment = labels['zoe.deployment_name'] != get_conf().deployment_name
    except KeyError:
        return
    if other_deployment:
        return

    log.debug(event)

    if event['Action'] != "die":
        return
    try:
        container_died(labels['zoe.service.id'])
    except KeyError:
        # Missing service id label (or a KeyError raised while reporting).
        return
Пример #9
0
def main():
    """
    The entrypoint for the zoe-observer script.

    Loads the configuration, configures logging (DEBUG when the ``debug``
    option is set, INFO otherwise), starts guest_check_thread() in a daemon
    thread named "guest-check" and then runs swarm_events_thread() in the
    calling thread.
    """
    load_configuration()
    args = get_conf()

    root_level = logging.DEBUG if args.debug else logging.INFO
    logging.basicConfig(level=root_level, format=LOG_FORMAT)

    # Per-library log levels.
    for logger_name, logger_level in (
            ('kazoo', logging.WARNING),
            ('requests', logging.WARNING),
            ('docker', logging.INFO),
    ):
        logging.getLogger(logger_name).setLevel(logger_level)

    threading.Thread(target=guest_check_thread, name="guest-check", args=[args], daemon=True).start()

    swarm_events_thread(args)
Пример #10
0
def main():
    """
    The entrypoint for the zoe-observer script.

    After loading the configuration and setting up logging, the guest check
    loop is started in a background daemon thread and the calling thread
    runs swarm_events_thread().
    """
    load_configuration()
    args = get_conf()

    log_level = logging.INFO
    if args.debug:
        log_level = logging.DEBUG
    logging.basicConfig(level=log_level, format=LOG_FORMAT)

    library_levels = {
        'kazoo': logging.WARNING,
        'requests': logging.WARNING,
        'docker': logging.INFO,
    }
    for library, level in library_levels.items():
        logging.getLogger(library).setLevel(level)

    thread_options = {
        'target': guest_check_thread,
        'name': "guest-check",
        'args': [args],
        'daemon': True,
    }
    checker = threading.Thread(**thread_options)
    checker.start()

    swarm_events_thread(args)
Пример #11
0
def check_guests(swarm):
    """
    Terminate guest executions whose Spark job has been idle for too long.

    Users with role 'guest' and all executions are fetched from the master
    API. For every execution owned by a guest that has at least one
    service, the services are inspected until one exposing a port named
    'Spark application web interface' reports an idle time for which
    check_if_kill() returns a true value; the execution is then terminated
    and its remaining services are not inspected.

    :param swarm: Swarm client, forwarded to check_spark_job()
    """
    query_api = ZoeQueryAPI(get_conf().master_url, 'zoeadmin',
                            get_conf().zoeadmin_password)
    exec_api = ZoeExecutionsAPI(get_conf().master_url, 'zoeadmin',
                                get_conf().zoeadmin_password)
    cont_api = ZoeServiceAPI(get_conf().master_url, 'zoeadmin',
                             get_conf().zoeadmin_password)

    guests = query_api.query('user', role='guest')
    execs = exec_api.list()
    for guest in guests:
        my_execs = [e for e in execs if e['owner'] == guest['name']]
        for my_exec in my_execs:
            if not my_exec['services']:
                continue
            # NOTE(review): now() is naive local time; this assumes
            # 'time_started' parses to a naive datetime as well - confirm.
            since_started = datetime.datetime.now() - dateutil.parser.parse(
                my_exec['time_started'])
            my_exec_since_started = since_started.total_seconds()
            terminate = False
            for service_id in my_exec['services']:
                service = cont_api.get(service_id)
                for port in service['ports']:
                    if port['name'] != 'Spark application web interface':
                        continue
                    idle_time = check_spark_job(swarm, service['docker_id'],
                                                my_exec_since_started)
                    if check_if_kill(idle_time):
                        log.info(
                            'Execution {} for user {} has been idle for too long, terminating...'
                            .format(my_exec['name'], guest['name']))
                        terminate = True
                        break
                    log.debug(
                        'Execution {} for user {} has been idle for {} seconds'
                        .format(my_exec['name'], guest['name'], idle_time))
                # This check must sit at service-loop level: placed inside
                # the port loop it can never run with terminate set, and the
                # remaining services would still be fetched and checked.
                if terminate:
                    break
            if terminate:
                exec_api.terminate(my_exec['id'])
Пример #12
0
def check_if_kill(idle_seconds):
    """
    Tell whether an idle time exceeds the configured Spark activity timeout.

    :param idle_seconds: idle time to compare against the timeout
    :return: True when idle_seconds is strictly greater than the configured
             ``spark_activity_timeout``, False otherwise
    """
    return bool(idle_seconds > get_conf().spark_activity_timeout)
Пример #13
0
def check_if_kill(idle_seconds):
    """
    Compare an idle time with the ``spark_activity_timeout`` setting.

    :param idle_seconds: idle time to test
    :return: True if idle_seconds is above the timeout, else False
    """
    timeout = get_conf().spark_activity_timeout
    return True if idle_seconds > timeout else False
Пример #14
0
def container_died(zoe_id: int):
    """
    Tell the scheduler, via the REST API, that a container died.

    Exceptions raised by the API call are not handled here.

    :param zoe_id: identifier handed to ZoeContainerAPI.died()
    """
    log.debug('A container died')
    scheduler_url = get_conf().scheduler_url
    admin_password = get_conf().zoeadmin_password
    ZoeContainerAPI(scheduler_url, 'zoeadmin', admin_password).died(zoe_id)