Example #1
from rq import get_failed_queue  # top-level helper in rq releases prior to 1.0

import sjs


def requeue_failed_jobs():
    failed_queue = get_failed_queue(connection=sjs.get_redis_conn())

    jobs_to_requeue = failed_queue.get_job_ids()
    for job_id in jobs_to_requeue:
        failed_queue.requeue(job_id)

    return jobs_to_requeue
Example #2
import sys
from time import sleep

import sjs

from sample_job import job_that_takes_a_long_time

filepath = sjs.DEFAULT_CONFIG_LOCATION
if len(sys.argv) > 1:
    filepath = sys.argv[1]

if not sjs.load(filepath):
    raise SystemExit()

sjs.run_pre_queue_checks(exit_on_fail=True)

redis_conn = sjs.get_redis_conn()
q = sjs.get_job_queue()

# enqueue sample jobs
jobs = []
jobs.append(q.enqueue(job_that_takes_a_long_time, 10))
jobs.append(q.enqueue(job_that_takes_a_long_time, 60))

# NOTE:
# Just because a job is queued, doesn't mean there are any workers to run it. If you are testing,
# you should go ahead and start a worker with `rq worker`.

# WARNING: you would normally be done here!
# But for this test code, we're going to wait around, get the output and print.
# NOT AT ALL RECOMMENDED FOR PRODUCTION CODE
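# The original example ends here; what follows is only a rough sketch of the
# "wait around, get the output and print" step described above, assuming the
# standard rq Job API (job.refresh(), job.is_finished / job.is_failed, job.result).
while not all(job.is_finished or job.is_failed for job in jobs):
    sleep(1)

for job in jobs:
    job.refresh()
    print("job %s -> %s" % (job.id, job.result))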
Example #3
import os
import signal
import subprocess
import sys
from datetime import datetime
from time import sleep

from rq import Worker

import sjs

# NOTE: get_sjs_running_file, SJS_RUNNING_FILE, save_env_record, read_env_record and
# disable_signals are assumed to be helpers defined elsewhere in the same module.


def launch_workers(num_workers, burst, run_pre_checks, run_env_checks,
                   interval):
    os.makedirs("logs", exist_ok=True)

    if run_pre_checks:
        print("Running pre-checks...")
        sjs.run_pre_worker_checks(exit_on_fail=True)
        print("OK!")
    else:
        print("Skipping pre-checks!")

    working_dir = get_sjs_running_file()
    if not working_dir:
        raise SystemExit("Currently there is no run started (i.e. there is no %s file). " \
            "Are you in the correct directory?" % SJS_RUNNING_FILE)

    hostname = os.uname()[1]
    timestamp = datetime.now().strftime("%Y_%m_%d__%H_%M_%S")

    # save this machine's env record and compare it with the one from the start of the run
    env_record_dir = os.path.join(working_dir, 'env_records')
    env_record_path = os.path.join(env_record_dir,
                                   "%s_%s" % (hostname, timestamp))
    env = save_env_record(env_record_path)
    orig_env_record = read_env_record(
        os.path.join(env_record_dir, 'env_record_start.yaml'))
    if run_env_checks:
        print("Running env-checks...")
        if env != orig_env_record:
            print("env_record of this machine does not match env record of original machine! " \
                "Aborting launch workers! Please see %s to compare manually" % (env_record_path))
            raise SystemExit(
                "Env records do not match, aborting launch workers!")
        else:
            print("OK!")
    else:
        print("Skipping env-checks!")

    print("")
    print("Running on hostname %s" % hostname)
    print("Running at timestamp %s" % timestamp)
    print("Log name template: %s_%s_*.log" % (hostname, timestamp))
    print("Env record path: %s" % env_record_path)
    if burst:
        print("Running in burst mode. Workers and launch_workers script will exit when all " \
              "workers are idle and the queue is empty.")
    else:
        print(
            "Workers and launch_workers script will stay alive until killed.")

    print("")
    worker_processes = []
    log_files = []

    sjs.load()
    sjs_config = sjs.get_sjs_config()
    redis_cfg = sjs_config['redis']
    redis_url = "redis://%s:%s/%s" % (redis_cfg['host'], redis_cfg['port'],
                                      redis_cfg['db'])
    cmd = ['rq', 'worker', "-u", redis_url, sjs_config['queue']]

    for i in range(num_workers):
        logname = 'logs/%s_%s_%s.log' % (hostname, timestamp, i)
        print("Launching worker #%s with log file %s" % (i, logname))

        log = open(logname, 'w')
        proc = subprocess.Popen(cmd, stdout=log, stderr=log)

        worker_processes.append(proc)
        log_files.append(log)

    print("")
    print("Worker PIDS: %s" % [w.pid for w in worker_processes])

    try:
        conn = sjs.get_redis_conn()

        if 'min_seconds_per_job' in sjs_config or not burst:
            # The more complex case: either handling bursted workers or enforcing the
            # min_seconds_per_job timeout. Here we run a loop and check the conditions
            # on each pass.
            while True:
                sleep(interval)

                if burst:
                    # There is no point killing workers on the node unless all of them are idle,
                    # so we poll the current worker state; if all the workers are idle AND the
                    # queue is empty, we kill all the workers and release the node.
                    workers = [
                        w for w in Worker.all(connection=conn)
                        if w.name.startswith(hostname)
                    ]
                    idle_workers = [w for w in workers if w.state == 'idle']
                    if len(idle_workers) == len(workers) and len(sjs.get_job_queue()) == 0:
                        print("All workers idle; queue is empty.")
                        disable_signals()
                        raise SystemExit()

                if 'min_seconds_per_job' in sjs_config:
                    try:
                        results = subprocess.check_output(
                            "qstat -i $PBS_JOBID",
                            shell=True,
                            universal_newlines=True)
                        # parse the remaining walltime (HH:MM:SS) from the last line of qstat output
                        hours, minutes, seconds = results.strip().split("\n")[-1][-8:].split(":")
                        walltime_remaining = int(hours) * 3600 + int(minutes) * 60 + int(seconds)

                        if sjs_config['min_seconds_per_job'] > walltime_remaining:
                            print("walltime remaining is less than the min seconds required per " \
                                  "job. Sending SIGINTs to workers so they exit when the " \
                                  "currently running job is complete")
                            for worker in worker_processes:
                                os.kill(worker.pid, signal.SIGINT)
                            break

                    except Exception as e:
                        print("Failure getting walltime", e)

        # the simplest case of just running the workers until they exit
        print("Waiting for workers to exit...")
        for w in worker_processes:
            w.wait()

    except SystemExit:
        # if this process is forced to exit, we kill the workers, and wait for them to
        # exit, before finally closing the log files.
        print("... killing any workers")

        # rq workers must be signaled twice to actually shutdown.
        # we sleep in between to avoid a signal getting lost.
        try:
            print("sending first SIGINT")
            os.killpg(os.getpgid(0), signal.SIGINT)
            sleep(1)
            print("sending second SIGINT")
            os.killpg(os.getpgid(0), signal.SIGINT)
        except ProcessLookupError:
            print("process already killed")
        for w in worker_processes:
            w.wait()
    finally:
        for f in log_files:
            f.close()

    print("")
    print("All done!")
    sys.stdout.flush()
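
# A hedged usage sketch (not part of the original): the argument values below are
# illustrative assumptions, e.g. launching 4 burst-mode workers that poll every 60s.
if __name__ == '__main__':
    launch_workers(num_workers=4, burst=True, run_pre_checks=True,
                   run_env_checks=True, interval=60)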
Example #4
def manage_run_results(job_id):
    return Job.fetch(job_id, connection=sjs.get_redis_conn()).result
Example #5
def jobs_running():
    registry = StartedJobRegistry(name=sjs.get_sjs_config()['queue'], connection=sjs.get_redis_conn())
    return registry.get_job_ids()
Example #6
def jobs_failed():
    failed_queue = get_failed_queue(connection=sjs.get_redis_conn())
    return failed_queue.get_job_ids()