Example #1
def start(skip_pre_checks):
    """
    Starts a new run, if one isn't already running.

    Runs all pre-checks, creates any necessary directories, archives the configs, records
    any arguments, and creates an env_record.
    """
    sjs.load()
    initialize_run(skip_pre_checks=skip_pre_checks)
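start() has a counterpart, finalize() (see the last example below), and the other examples enqueue jobs and launch workers in between. As a hedged sketch of how the pieces fit together (inferred from this listing, not taken from the sjs source):

# Rough lifecycle sketch (names as used in the examples below; the ordering is an inference):
# 1. start(skip_pre_checks=False)   # pre-checks, directories, config archive, env_record
# 2. enqueue jobs onto the sjs/RQ queue (see the enqueue example below)
# 3. launch_workers(...) on each compute node (see launch_workers below)
# 4. finalize()                     # end_run(): archive config and data, report status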
Example #2
# prior to running this, you must:
#
# module load python/anaconda3-2.3.0-rhel

import sys
from time import sleep

import sjs

from sample_job import job_that_takes_a_long_time

filepath = sjs.DEFAULT_CONFIG_LOCATION
if len(sys.argv) > 1:
    filepath = sys.argv[1]

if not sjs.load(filepath):
    raise SystemExit("Unable to load sjs config from %s" % filepath)

sjs.run_pre_queue_checks(exit_on_fail=True)

redis_conn = sjs.get_redis_conn()
q = sjs.get_job_queue()

# enqueue sample jobs
jobs = []
jobs.append(q.enqueue(job_that_takes_a_long_time, 10))
jobs.append(q.enqueue(job_that_takes_a_long_time, 60))

# NOTE:
# Just because a job is queued doesn't mean there are any workers to run it. If you are
# testing, go ahead and start a worker with `rq worker`.
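The NOTE above matters in practice: enqueued jobs sit in Redis until a worker picks them up. As a minimal sketch (not part of the original example, and assuming an `rq worker` is running against the same Redis instance), the Job objects returned by enqueue() can be polled for completion using standard RQ attributes:

# Sketch: wait until every enqueued job has finished or failed, then report results.
while not all(job.is_finished or job.is_failed for job in jobs):
    sleep(5)

for job in jobs:
    print(job.id, job.get_status(), job.result)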
Example #3
def launch_workers(num_workers, burst, run_pre_checks, run_env_checks,
                   interval):
    os.makedirs("logs", exist_ok=True)

    if run_pre_checks:
        print("Running pre-checks...")
        sjs.run_pre_worker_checks(exit_on_fail=True)
        print("OK!")
    else:
        print("Skipping pre-checks!")

    working_dir = get_sjs_running_file()
    if not working_dir:
        raise SystemExit("Currently there is no run started (i.e. there is no %s file). " \
            "Are you in the correct directory?" % SJS_RUNNING_FILE)

    hostname = os.uname()[1]
    timestamp = datetime.now().strftime("%Y_%m_%d__%H_%M_%S")

    # compare env_record at start of run with this one
    env_record_dir = os.path.join(working_dir, 'env_records')
    env_record_path = os.path.join(env_record_dir,
                                   "%s_%s" % (hostname, timestamp))
    env = save_env_record(env_record_path)
    orig_env_record = read_env_record(
        os.path.join(env_record_dir, 'env_record_start.yaml'))
    if run_env_checks:
        print("Running env-checks...")
        if env != orig_env_record:
            print("env_record of this machine does not match env record of original machine! " \
                "Aborting launch workers! Please see %s to compare manually" % (env_record_path))
            raise SystemExit(
                "Env records do not match, aborting launch workers!")
        else:
            print("OK!")
    else:
        print("Skipping env-checks!")

    print("")
    print("Running on hostname %s" % hostname)
    print("Running at timestamp %s" % timestamp)
    print("Log name template: %s_%s_*.log" % (hostname, timestamp))
    print("Env record path: %s" % env_record_path)
    if burst:
        print("Running in burst mode. Workers and launch_workers script will exit when all " \
              "workers are idle and the queue is empty.")
    else:
        print(
            "Workers and launch_workers script will stay alive until killed.")

    print("")
    worker_processes = []
    log_files = []

    sjs.load()
    sjs_config = sjs.get_sjs_config()
    redis_cfg = sjs_config['redis']
    redis_url = "redis://%s:%s/%s" % (redis_cfg['host'], redis_cfg['port'],
                                      redis_cfg['db'])
    cmd = ['rq', 'worker', "-u", redis_url, sjs_config['queue']]

    for i in range(num_workers):
        logname = 'logs/%s_%s_%s.log' % (hostname, timestamp, i)
        print("Launching worker #%s with log file %s" % (i, logname))

        log = open(logname, 'w')
        proc = subprocess.Popen(cmd, stdout=log, stderr=log)

        worker_processes.append(proc)
        log_files.append(log)

    print("")
    print("Worker PIDS: %s" % [w.pid for w in worker_processes])

    try:
        conn = sjs.get_redis_conn()

        if 'min_seconds_per_job' in sjs_config or not burst:
            # more complex case: handle burst-mode workers and/or the min_seconds_per_job
            # timeout. Run a loop and check the conditions on each pass.
            while True:
                sleep(interval)

                if burst:
                    # there is no point killing workers on the node unless all of them are idle and
                    # we can kill all the workers and release the node. So here we poll for the
                    # current worker state and if all the workers are idle AND the queue is empty,
                    # then we shut the node down.
                    workers = [w for w in Worker.all(connection=conn)
                               if w.name.startswith(hostname)]
                    idle_workers = [w for w in workers if w.state == 'idle']
                    if len(idle_workers) == len(workers) and len(sjs.get_job_queue()) == 0:
                        print("All workers idle; queue is empty.")
                        disable_signals()
                        raise SystemExit()

                if 'min_seconds_per_job' in sjs_config:
                    try:
                        results = subprocess.check_output("qstat -i $PBS_JOBID", shell=True,
                                                          universal_newlines=True)
                        hours, minutes, seconds = results.strip().split("\n")[-1][-8:].split(":")
                        walltime_remaining = int(hours) * 3600 + int(minutes) * 60 + int(seconds)

                        if sjs_config['min_seconds_per_job'] > walltime_remaining:
                            print("walltime remaining is less than the min seconds required per " \
                                  "job. Sending SIGINTs to workers so they exit when the " \
                                  "currently running job is complete")
                            for worker in worker_processes:
                                os.kill(worker.pid, signal.SIGINT)
                            break

                    except Exception as e:
                        print("Failure getting walltime", e)

        # the simplest case of just running the workers until they exit
        print("Waiting for workers to exit...")
        for w in worker_processes:
            w.wait()

    except SystemExit:
        # if this process is forced to exit, we kill the workers, and wait for them to
        # exit, before finally closing the log files.
        print("... killing any workers")

        # rq workers must be signaled twice to actually shut down.
        # we sleep in between to avoid a signal getting lost.
        try:
            print("sending first SIGINT")
            os.killpg(os.getpgid(0), signal.SIGINT)
            sleep(1)
            print("sending second SIGINT")
            os.killpg(os.getpgid(0), signal.SIGINT)
        except ProcessLookupError:
            print("process already killed")
        for w in worker_processes:
            w.wait()
    finally:
        for f in log_files:
            f.close()

    print("")
    print("All done!")
    sys.stdout.flush()
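launch_workers() is only a function; a hypothetical click wrapper (click is imported in other examples in this listing, but this wrapper is not part of the original code) could expose it as a command-line entry point along these lines:

import click

@click.command()
@click.option('--num-workers', default=1, help='number of rq workers to launch on this node')
@click.option('--burst/--no-burst', default=False, help='exit once all workers are idle and the queue is empty')
@click.option('--skip-pre-checks', is_flag=True)
@click.option('--skip-env-checks', is_flag=True)
@click.option('--interval', default=60, help='seconds between checks of worker and queue state')
def cli(num_workers, burst, skip_pre_checks, skip_env_checks, interval):
    # Hypothetical wrapper: these flags are assumptions, not options defined by sjs itself.
    launch_workers(num_workers, burst,
                   run_pre_checks=not skip_pre_checks,
                   run_env_checks=not skip_env_checks,
                   interval=interval)

if __name__ == '__main__':
    cli()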
Example #4
                                                      read_mof_configuration,
                                                      run_composition_simulation)

mofs_filepath = sys.argv[1]
gas_comps_filepath = sys.argv[2]
pressure = sys.argv[3]

mofs = read_mof_configuration(mofs_filepath)
compositions = read_composition_configuration(gas_comps_filepath)
gases = list(compositions[0].keys())

run_name = generate_unique_run_name()
output_dir = 'output_%s' % run_name
os.makedirs(output_dir)

sjs.load(os.path.join("settings","sjs.yaml"))
job_queue = sjs.get_job_queue()

# set up the CSV output file and write the header row
f = open(os.path.join(output_dir, 'comp_mass_output.csv'), 'w', newline='')
header = ['Run ID', 'MOF', 'Mass']
for gas in gases:
    header.append(gas)
writer = csv.writer(f, delimiter='\t')
writer.writerow(header)

if job_queue is not None:
    print("Queueing jobs onto queue: %s" % job_queue)

    run_id_number = 0
Example #5
import curses
from datetime import datetime
import signal
import subprocess
import sys
from time import sleep

import click
from rq import Worker, Queue

import sjs
from sjs.run import initialize_run, end_run, run_started
from sjs.curses_fullscreen import curses_fullscreen

sjs.load()
conn = sjs.get_redis_conn()
stdscr = None
maxyx = None


def disable_signals():
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    signal.signal(signal.SIGTERM, signal.SIG_IGN)

def signal_handler(signal_received, frame):
    disable_signals()
    raise SystemExit("Received signal %s." % signal_received)

signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)

def job_string(j):
    if j is None:
Example #6
from ipmof.parameters import read_parameters

# Read simulation parameters and directories
sim_par, sim_dir = read_parameters()

# Get list of interpenetrating MOFs
interpenetration_list = get_interpenetration_list(sim_par, sim_dir)
print('Initializing interpenetration for', len(interpenetration_list), 'MOF combinations...')

for ip_index, interpenetration_path in enumerate(interpenetration_list, start=1):

    emap_path, emap_mof_path, ip_mof_path = interpenetration_path
    print('-' * 80 + '\n' + str(ip_index), 'Energy map ->', os.path.basename(emap_path))
    print(' ' * len(str(ip_index)), 'Interpenetration ->', os.path.basename(ip_mof_path) + '\n' + '-' * 80)

    # Run interpenetration
    if sys.argv[-1] == 'q':
        # Load job server libraries
        from rq import Queue
        from redis import Redis
        import sjs

        # Load job queue
        sjs.load(os.path.join("settings", "sjs.yaml"))
        job_queue = sjs.get_job_queue()

        # Run interpenetration
        job_queue.enqueue(run_interpenetration, interpenetration_path, sim_par, sim_dir)
    else:
        run_interpenetration(interpenetration_path, sim_par, sim_dir)
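Usage sketch (hypothetical invocations; the script name is an assumption): run the script directly to execute each interpenetration in-process, or pass q as the final argument to enqueue each combination for rq workers instead.

#   python init_interpenetration.py      # run every MOF combination locally, in this process
#   python init_interpenetration.py q    # enqueue each combination onto the sjs/RQ queue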
Example #7
def finalize():
    """
    Finalizes the run, archives the config and all data directories, and outputs status.
    """
    sjs.load()
    end_run()