Exemplo n.º 1
0
def control(queues, traces, args):
    """
    Main control function, run from the relevant workflow module.

    Runs the monitoring loop until args.graceful_stop is set. Each pass waits on the graceful_stop
    event, aborts if a kill signal has been pending for longer than MAX_KILL_WAIT_TIME, aborts if
    the maximum running time has been exceeded, runs the remaining checks via run_checks() and,
    every config.Pilot.thread_check seconds, inspects the threads known to the threading module.

    :param queues: queues object (passed on to get_queuedata_from_job() and run_checks()).
    :param traces: traces object; traces.pilot['lifetime_start'] and ['lifetime_max'] are set here.
    :param args: pilot arguments; graceful_stop, kill_time, lifetime and update_server are used.
    :raises PilotException: wrapping any exception caught inside the monitoring loop.
    :return:
    """

    t0 = time.time()
    traces.pilot['lifetime_start'] = t0  # ie referring to when pilot monitoring began
    traces.pilot['lifetime_max'] = t0

    # interval (s) between thread checks; a value <= 0 means "check on every pass"
    threadchecktime = int(config.Pilot.thread_check)
    # The time of the last thread check is tracked explicitly. Testing
    # 'int(elapsed) % threadchecktime == 0' is unreliable since a pass takes at least two seconds
    # (so exact multiples are easily stepped over) and it divides by zero for a zero interval.
    last_thread_check = t0

    # for CPU usage debugging
    cpuchecktime = int(config.Pilot.cpu_check)
    tcpu = t0

    queuedata = get_queuedata_from_job(queues)
    max_running_time = get_max_running_time(args.lifetime, queuedata)

    # stop margin used in the max running time check below
    grace_time = 10 * 60

    try:
        # overall loop counter (ignoring the fact that more than one job may be running)
        n = 0

        while not args.graceful_stop.is_set():
            # wait up to one second for graceful_stop before running the monitoring checks
            if args.graceful_stop.wait(1) or args.graceful_stop.is_set():  # 'or' added for 2.6 compatibility
                logger.warning('aborting monitor loop since graceful_stop has been set')
                break

            # abort if kill signal arrived too long time ago, ie loop is stuck
            if args.kill_time and int(time.time()) - args.kill_time > MAX_KILL_WAIT_TIME:
                logger.warning('loop has run for too long time - will abort')
                args.graceful_stop.set()
                break

            # check if the pilot has run out of time
            # NOTE(review): the log message below talks about stopping ten minutes *before* the
            # limit, but this condition only becomes true ten minutes *after* max_running_time has
            # passed - confirm which one is intended before changing it
            time_since_start = get_time_since_start(args)
            if time_since_start - grace_time > max_running_time:
                logger.fatal('max running time (%d s) minus grace time (%d s) has been exceeded - must abort pilot' %
                             (max_running_time, grace_time))
                logger.info('setting REACHED_MAXTIME and graceful stop')
                environ['REACHED_MAXTIME'] = 'REACHED_MAXTIME'  # TODO: use singleton instead
                # do not set graceful stop if pilot has not finished sending the final job update
                # i.e. wait until SERVER_UPDATE is FINAL_DONE
                check_for_final_server_update(args.update_server)
                args.graceful_stop.set()
                break

            # progress message on every 60th pass of the loop
            if n % 60 == 0:
                logger.info('%d s have passed since pilot start' % time_since_start)
            time.sleep(1)

            # time to check the CPU? (deliberately disabled by the 'and False')
            if int(time.time() - tcpu) > cpuchecktime and False:  # for testing only
                processes = get_process_info('python pilot2/pilot.py', pid=getpid())
                if processes:
                    logger.info('-' * 100)
                    logger.info('PID=%d has CPU usage=%s%% MEM usage=%s%% CMD=%s' % (getpid(), processes[0], processes[1], processes[2]))
                    # keep the process count in its own variable so that the loop counter n
                    # is not overwritten
                    process_count = processes[3]
                    if process_count > 1:
                        logger.info('there are %d such processes running' % process_count)
                    else:
                        logger.info('there is %d such process running' % process_count)
                    logger.info('-' * 100)
                tcpu = time.time()

            # proceed with running the other checks
            run_checks(queues, args)

            # thread monitoring, once per threadchecktime seconds
            now = time.time()
            if now - last_thread_check >= threadchecktime:
                last_thread_check = now
                # NOTE(review): threading.enumerate() is documented to exclude terminated threads,
                # so this test can only trigger for a thread that ends between the two calls -
                # confirm whether a list of expected threads should be checked instead
                for thread in threading.enumerate():
                    if not thread.is_alive():
                        logger.fatal("thread '%s' is not alive" % thread.name)
                        # args.graceful_stop.set()

            n += 1

    except Exception as e:
        print("monitor: exception caught: %s" % e)
        raise PilotException(e)

    logger.info('[monitor] control thread has ended')
Exemplo n.º 2
0
def control(queues, traces, args):
    """
    Main control function, run from the relevant workflow module.

    Runs the monitoring loop until args.graceful_stop is set. Each pass waits on the graceful_stop
    event, aborts if the maximum running time has been exceeded, runs the checks via run_checks()
    and, every config.Pilot.thread_check seconds, inspects the threads known to the threading
    module.

    :param queues: queues object (passed on to get_queuedata_from_job() and run_checks()).
    :param traces: traces object; traces.pilot['lifetime_start'] and ['lifetime_max'] are set here.
    :param args: pilot arguments; graceful_stop, lifetime and update_server are used.
    :raises PilotException: wrapping any exception caught inside the monitoring loop.
    :return:
    """

    lifetime_start = time.time()
    traces.pilot['lifetime_start'] = lifetime_start  # ie referring to when pilot monitoring began
    traces.pilot['lifetime_max'] = time.time()

    # interval (s) between thread checks; a value <= 0 means "check on every pass"
    threadchecktime = int(config.Pilot.thread_check)
    # The time of the last thread check is tracked explicitly. Testing
    # 'int(elapsed) % threadchecktime == 0' is unreliable since a pass takes at least two seconds
    # (so exact multiples are easily stepped over) and it divides by zero for a zero interval.
    last_thread_check = lifetime_start

    queuedata = get_queuedata_from_job(queues)
    if queuedata:
        logger.debug('extracted queuedata from job object')
    else:
        logger.debug('failed to extract queuedata from job object')
    max_running_time = get_max_running_time(args.lifetime, queuedata)

    # stop margin used in the max running time check below
    grace_time = 10 * 60

    try:
        # overall loop counter (ignoring the fact that more than one job may be running)
        n = 0

        while not args.graceful_stop.is_set():
            # wait up to one second for graceful_stop before running the monitoring checks
            if args.graceful_stop.wait(1) or args.graceful_stop.is_set():  # 'or' added for 2.6 compatibility
                break

            # check if the pilot has run out of time
            # NOTE(review): the log message below talks about stopping ten minutes *before* the
            # limit, but this condition only becomes true ten minutes *after* max_running_time has
            # passed - confirm which one is intended before changing it
            time_since_start = get_time_since_start(args)
            if time_since_start - grace_time > max_running_time:
                logger.fatal(
                    'max running time (%d s) minus grace time (%d s) has been exceeded - must abort pilot'
                    % (max_running_time, grace_time))
                logger.info('setting REACHED_MAXTIME and graceful stop')
                environ['REACHED_MAXTIME'] = 'REACHED_MAXTIME'  # TODO: use singleton instead
                # do not set graceful stop if pilot has not finished sending the final job update
                # i.e. wait until SERVER_UPDATE is FINAL_DONE
                check_for_final_server_update(args.update_server)
                args.graceful_stop.set()
                break

            # progress message on every 60th pass of the loop
            if n % 60 == 0:
                logger.info('%d s have passed since pilot start' % time_since_start)
            time.sleep(1)

            # proceed with running the checks
            run_checks(queues, args)

            # thread monitoring, once per threadchecktime seconds
            now = time.time()
            if now - last_thread_check >= threadchecktime:
                last_thread_check = now
                # NOTE(review): threading.enumerate() is documented to exclude terminated threads,
                # so this test can only trigger for a thread that ends between the two calls -
                # confirm whether a list of expected threads should be checked instead
                for thread in threading.enumerate():
                    if not thread.is_alive():
                        logger.fatal("thread '%s' is not alive" % thread.name)
                        # args.graceful_stop.set()

            n += 1

    except Exception as e:
        print("monitor: exception caught: %s" % e)
        raise PilotException(e)

    logger.info('[monitor] control thread has ended')