def kill_job(self, job):
    """
    Notify the worker that a running job should be killed, or remove the
    job from the queue if it has not started yet

    Args:
        job (modelrunner.Job): job instance
    """
    if job.status == Job.STATUS_QUEUED:
        # case 1: job is in QUEUED state
        # remove it from the queue and mark as killed
        job_queue = job_queue_name(job.model)
        logger.info("killing job {} by removing from queue {}".
                    format(job.uuid, job_queue))
        command_dict = {'command': 'PROCESS_JOB', 'job_uuid': job.uuid}
        remove_command(redis_connection(), job_queue, command_dict)
        job.status = Job.STATUS_KILLED
        # save it
        Job[job.uuid] = job
    elif job.status == Job.STATUS_RUNNING:
        # case 2: job is in RUNNING state
        # send message to worker to kill the job
        worker = worker_name(job.worker_url, job.model)
        worker_channel = node_channel_name(worker)
        logger.info("sending command to kill job on channel {}".
                    format(worker_channel))
        command_dict = {'command': "KILL_JOB", 'job_uuid': job.uuid}
        publish_command(redis_connection(), worker_channel, command_dict)
    else:
        logger.info("kill called on job {} in incompatible state {}".
                    format(job.uuid, job.status))

def test_run_good_bad():
    model_name = "test"
    config = make_config(model_name)
    worker = get_worker(config)

    sleep8_job = setup_queued_job(config, "processed_test", "sleep_8.zip")
    bad_job = setup_queued_job(config, "failed_test", "bad.zip")
    enqueue_worker_job(sleep8_job)
    enqueue_worker_job(bad_job)

    # process good and bad jobs in bg thread
    tq = Thread(target=worker.wait_for_queue_commands)
    tq.start()

    # give it some time
    time.sleep(10)

    assert Job[sleep8_job.uuid].status == Job.STATUS_PROCESSED
    assert Job[bad_job.uuid].status == Job.STATUS_FAILED

    # stop waiting
    stop_queue_command = {'command': 'STOP_PROCESSING_QUEUE'}
    enqueue_command(
        redis_connection(),
        job_queue_name(model_name),
        stop_queue_command)
    tq.join()
    cleanup(config)

def enqueue_complete_job(job):
    """ Submit job to queue for primary """
    redis_conn = redis_connection()
    queue_name = primary_queue_name(job.primary_url)
    command_dict = {'command': 'COMPLETE_JOB', 'job_uuid': job.uuid}
    enqueue_command(redis_conn, queue_name, command_dict)

def enqueue_worker_job(job):
    """ Submit job to queue for worker """
    redis_conn = redis_connection()
    queue_name = job_queue_name(job.model)
    command_dict = {'command': 'PROCESS_JOB', 'job_uuid': job.uuid}
    enqueue_command(redis_conn, queue_name, command_dict)

def enqueue_job(self, queue):
    job = {'id': self.count, 'origin': self.name, 'status': 'NEW'}
    command_dict = {'command': 'PROCESS_JOB', 'job': job}
    enqueue_command(redis_connection(), queue, command_dict)
    self.count += 1

def publish(channel_name, command_dict, wait_time=0):
    """
    Publish a command to a channel, optionally after a delay
    (test helper for exercising worker command processing)
    """
    redis_conn = redis_connection()
    if wait_time > 0:
        time.sleep(wait_time)

    publish_command(redis_conn, channel_name, command_dict)

def get_primary(config):
    primary_handler = PrimaryServer(
        config["primary_url"],
        config["primary_data_dir"])

    channels = [node_channel_name(primary_handler.node.name),
                all_nodes_channel_name()]

    primary = Dispatcher(
        redis_connection(),
        primary_handler,
        primary_queue_name(primary_handler.node.name),
        channels)
    return primary

def get_worker(config):
    worker_handler = WorkerServer(
        config["worker_url"],
        config["worker_data_dir"],
        config["model"],
        config["command_dict"])

    channels = [node_channel_name(worker_handler.node.name),
                all_nodes_channel_name()]

    worker = Dispatcher(
        redis_connection(),
        worker_handler,
        job_queue_name(config["model"]),
        channels)
    return worker

def cleanup(config):
    redis_conn = redis_connection()

    def delete_subdirs(d):
        for subdir in os.listdir(d):
            full_subdir = os.path.join(d, subdir)
            if os.path.isdir(full_subdir):
                shutil.rmtree(full_subdir, ignore_errors=True)

    delete_subdirs(config["primary_data_dir"])
    delete_subdirs(config["worker_data_dir"])
    redis_conn.flushdb()

def process_job(self, command_dict):
    job = command_dict['job']
    self.jobs[job['id']] = job
    job['status'] = 'PROCESSING'

    sleep_amount = 0
    while sleep_amount < self.sleep_time:
        time.sleep(1)
        if job['status'] == 'KILLING':
            job['status'] = 'KILLED'
            break
        sleep_amount += 1

    if sleep_amount == self.sleep_time:
        job['status'] = 'PROCESSED'

    command_dict = {'command': 'COMPLETE_JOB', 'job': job}
    enqueue_command(redis_connection(), job['origin'], command_dict)

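# Hedged sketch (not in the original source): a KILL_JOB handler that a test
# worker handler could pair with process_job() above. It flips the in-memory
# status to 'KILLING' so the polling loop notices and records 'KILLED'. The
# handler body is an assumption for illustration; only the 'PROCESSING',
# 'KILLING' and 'KILLED' status values come from the code above.
def kill_job(self, command_dict):
    for job in self.jobs.values():
        if job['status'] == 'PROCESSING':
            job['status'] = 'KILLING'
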
def refresh_node_status(self):
    """
    Refresh the status of all nodes by
    1. Deleting existing state
    2. Publishing a request for all nodes to update

    All listening nodes will update the Node hash with their state
    """
    for node in Node.values():
        del Node[node.name]

    status_command = {"command": "UPDATE_STATUS"}
    publish_command(
        redis_connection(),
        all_nodes_channel_name(),
        status_command)

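# Hedged usage sketch: after refresh_node_status() publishes UPDATE_STATUS, a
# caller might pause briefly and then read the re-populated Node hash back.
# The primary_handler name, the one-second delay, and the printed 'status'
# attribute are assumptions made only for this illustration.
primary_handler.refresh_node_status()
time.sleep(1)  # give listening nodes a moment to republish their state
for node in Node.values():
    print(node.name, node.status)
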
def enqueue(self, job, job_data_blob=None, job_data_url=None):
    """
    Write job data to file and queue up for processing

    Note: This should be run async wrt a web server as it will block
    on fetching/writing data

    Args:
        job_data_blob (blob): blob of a zip file to be written to disk
        job_data_url (str): the url of a zip file to be fetched
    """
    # only allow job data as blob or url
    assert (job_data_blob is None) ^ (job_data_url is None)

    job_data_dir = os.path.join(self.data_dir, job.uuid)
    if not os.path.exists(job_data_dir):
        os.mkdir(job_data_dir)

    job_data_file = os.path.join(job_data_dir, "input.zip")
    if job_data_blob:
        logger.info("writing input file for job to {}".
                    format(job_data_file))
        with open(job_data_file, 'wb') as file_handle:
            file_handle.write(job_data_blob)
    else:
        logger.info("retrieving input file for job and writing to {}".
                    format(job_data_file))
        fetch_file_from_url(job_data_url, job_data_dir, "input.zip")

    # add to global job list then queue it to be run
    job.primary_url = self.node.node_url
    job.primary_data_dir = self.data_dir  # to know where output.zip is
    job.status = Job.STATUS_QUEUED
    Job[job.uuid] = job

    job_queue = job_queue_name(job.model)
    command_dict = {'command': 'PROCESS_JOB', 'job_uuid': job.uuid}
    enqueue_command(redis_connection(), job_queue, command_dict)

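# Hedged usage sketch for PrimaryServer.enqueue() above: exactly one of
# job_data_blob or job_data_url may be given (enforced by the assertion).
# The PrimaryServer arguments and the Job(...) constructor call are
# placeholders for illustration; they are not taken from the original source.
primary_handler = PrimaryServer("http://localhost:8000", "/tmp/primary_data")
job = Job(model="test")  # hypothetical constructor call
with open("input.zip", "rb") as f:
    primary_handler.enqueue(job, job_data_blob=f.read())
# or, fetched by url instead of passed as a blob:
# primary_handler.enqueue(job, job_data_url="http://example.com/input.zip")
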
def test_primary_worker_scenario():
    primary_handler = PrimaryCommandHandler()
    worker_handler = WorkerCommandHandler()

    primary = Dispatcher(redis_connection(), primary_handler,
                         "primary", ["primary"])
    worker = Dispatcher(redis_connection(), worker_handler,
                        "worker", ["worker"])

    # start them up
    Thread(target=primary.wait_for_queue_commands).start()
    Thread(target=primary.wait_for_channel_commands).start()
    Thread(target=worker.wait_for_queue_commands).start()
    Thread(target=worker.wait_for_channel_commands).start()

    # submit a job
    primary_handler.enqueue_job("worker")

    # wait for it to complete
    sleep_time = 0
    while sleep_time < worker_handler.sleep_time + 1:
        time.sleep(1)
        sleep_time += 1

    assert len(primary_handler.jobs) == 1 and \
        primary_handler.jobs[0]['status'] == 'COMPLETE'

    stop_queue_command = {'command': 'STOP_PROCESSING_QUEUE'}
    stop_channel_command = {'command': 'STOP_PROCESSING_CHANNELS'}
    publish_command(redis_connection(), "worker", stop_queue_command)
    publish_command(redis_connection(), "primary", stop_queue_command)
    publish_command(redis_connection(), "worker", stop_channel_command)
    publish_command(redis_connection(), "primary", stop_channel_command)

def kill_job(self, channel):
    command_dict = {'command': 'KILL_JOB'}
    publish_command(redis_connection(), channel, command_dict)

logger.info("modelrunner %s (Python %s)" % (__version__, '.'.join(map(str, sys.version_info[:3])))) # so we can load config via cmd line args parse_command_line() parse_config_file(config.options.config_file) # initialize the global application settings initialize(config.options.redis_url) # get the command_ keys command_dict = config.options.group_dict("model_command") worker_handler = WorkerServer( config.options.worker_url, config.options.data_dir, config.options.model, command_dict) channels = [node_channel_name(worker_handler.node.name), all_nodes_channel_name()] worker = Dispatcher(redis_connection(), worker_handler, job_queue_name(config.options.model), channels) # start listening for commands on queue and channels in bg Thread(target=worker.wait_for_queue_commands).start() Thread(target=worker.wait_for_channel_commands).start()
def process_job(self, command_dict):
    """
    process job command

    command format:
        {'command': 'PROCESS_JOB', 'job_uuid': <uuid>}
    """
    job_uuid = command_dict['job_uuid']
    try:
        job = Job[job_uuid]
    except KeyError as e:
        # Job not found is not worth re-raising
        logger.warning(e)
        logger.warning("Job {} missing".format(job_uuid))
        return

    # assign the job to this worker
    job.worker_url = self.node.node_url
    job.worker_data_dir = self.data_dir

    job_data_dir = self._setup_job_dir(job)

    # setup subproc to run model command and output to local job log
    logger.info("preparing input for job {}".format(job.uuid))
    job_data_log = open(os.path.join(job_data_dir, "job_log.txt"), 'w')

    # primary_queue to notify primary server of any errors or completion
    primary_queue = primary_queue_name(job.primary_url)

    # update job status
    job.status = Job.STATUS_RUNNING
    job.on_primary = False  # now on worker
    Job[job.uuid] = job

    # catch data prep exceptions so that we mark the job as failed
    try:
        self._prep_input(job)
    except Exception:
        # Fail the job, log it and notify primary
        failure_msg = "Failed prepping data for job {}".format(job.uuid)
        logger.error(failure_msg)
        job_data_log.write(failure_msg)
        job_data_log.close()
        job.status = Job.STATUS_FAILED
        Job[job.uuid] = job
        command_dict = {'command': 'COMPLETE_JOB', 'job_uuid': job.uuid}
        enqueue_command(redis_connection(), primary_queue, command_dict)
        return

    # Input has been prepped so start the job
    command = self.model_commands[self.node.model]
    logger.info("starting job {}".format(job.uuid))

    # add the input and output dir to the command
    popen_proc = self._run_subprocess(command, job, job_data_log)

    # set hidden status attributes
    self.set_node_status(Node.STATUS_RUNNING,
                         job_uuid=job.uuid,
                         job_pid=popen_proc.pid)

    logger.info("job {} running with pid {}".format(
        job.uuid, popen_proc.pid))

    # wait for command to finish or for it to be killed
    return_code = popen_proc.wait()

    # Reset hidden status attributes
    self.set_node_status(Node.STATUS_WAITING)

    # close job log
    job_data_log.close()

    logger.info("finished job {} with return code {}".format(
        job.uuid, return_code))

    # update job status (use command return code for now)
    if return_code == 0:
        logger.info("zipping output of job {}".format(job.uuid))
        self._prep_output(job)
        job.status = Job.STATUS_PROCESSED
    elif return_code == -signal.SIGKILL:
        job.status = Job.STATUS_KILLED
    else:
        job.status = Job.STATUS_FAILED

    Job[job.uuid] = job

    # notify primary server job is done
    command_dict = {'command': 'COMPLETE_JOB', 'job_uuid': job.uuid}
    enqueue_command(redis_connection(), primary_queue, command_dict)

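# Context for the "-signal.SIGKILL" branch above: Popen.wait() returns the
# negative signal number when the child is terminated by a signal, so a SIGKILL
# delivered to the job's pid surfaces here as -9. Minimal standalone sketch
# (POSIX only); the sleep command is just a stand-in for a model run.
import signal
import subprocess

proc = subprocess.Popen(["sleep", "60"])
proc.send_signal(signal.SIGKILL)
assert proc.wait() == -signal.SIGKILL
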
# setup log
logger = logging.getLogger('modelrunner')
logger.info("modelrunner %s (Python %s)" %
            (__version__, '.'.join(map(str, sys.version_info[:3]))))

# so we can load config via cmd line args
parse_command_line()
parse_config_file(config.options.config_file)

# initialize the global application settings
initialize(config.options.redis_url)

# get the command_ keys
command_dict = config.options.group_dict("model_command")

worker_handler = WorkerServer(config.options.worker_url,
                              config.options.data_dir,
                              config.options.model,
                              command_dict)

channels = [
    node_channel_name(worker_handler.node.name),
    all_nodes_channel_name()
]

worker = Dispatcher(redis_connection(),
                    worker_handler,
                    job_queue_name(config.options.model),
                    channels)

# start listening for commands on queue and channels in bg
Thread(target=worker.wait_for_queue_commands).start()
Thread(target=worker.wait_for_channel_commands).start()

# -*- coding: utf-8 -*-
from modelrunner import settings
from modelrunner.redisent import RedisEntity

import datetime
from six import string_types

RedisEntity._prefix = "test"
RedisEntity._db = settings.redis_connection()


class User(RedisEntity):

    def __init__(self, id=None, name=None, created=None):
        self.id = int(id)
        self.name = name
        self.created = self._init_created(created)

    def _init_created(self, created):
        if isinstance(created, string_types):
            return datetime.datetime.strptime(created, "%Y-%m-%dT%H:%M:%S")
        elif isinstance(created, datetime.datetime):
            return created
        else:
            raise ValueError("Invalid type {} for created attribute".format(
                type(created)))

    def __eq__(self, other):
        return (isinstance(other, self.__class__) and
                self.__dict__ == other.__dict__)

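# Hedged usage sketch for the User entity, following the mapping-style access
# used for Job elsewhere (Entity[key] = value / Entity[key]). Keying users by
# their id, and the assumption that the entity round-trips through redis as an
# equal object, are made only for this illustration.
user = User(id=1, name="alice", created="2016-01-01T00:00:00")
User[user.id] = user
assert User[user.id] == user
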
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Script to stop the configured redis instance
"""

from modelrunner import config
from modelrunner.settings import (initialize, redis_connection)

# setup config options
from tornado.options import parse_command_line, parse_config_file

# so we can load config via cmd line args
parse_command_line()
parse_config_file(config.options.config_file)

# initialize the global application settings
initialize(config.options.redis_url)

# stop redis
redis_connection().shutdown()

def get_queued_commands():
    return get_all_commands(
        redis_connection(),
        job_queue_name(model_name))

logger = logging.getLogger('modelrunner')
logger.info("modelrunner %s (Python %s)" %
            (__version__, '.'.join(map(str, sys.version_info[:3]))))

# so we can load config via cmd line args
parse_command_line()
parse_config_file(config.options.config_file)

# initialize the global application settings
initialize(config.options.redis_url)

# get the command_ keys
command_dict = config.options.group_dict("model_command")

primary_handler = PrimaryServer(
    config.options.primary_url,
    config.options.data_dir)

channels = [node_channel_name(primary_handler.node.name),
            all_nodes_channel_name()]

primary = Dispatcher(
    redis_connection(),
    primary_handler,
    primary_queue_name(primary_handler.node.name),
    channels)

# continuously wait for jobs to complete and for status inquiries
Thread(target=primary.wait_for_queue_commands).start()
Thread(target=primary.wait_for_channel_commands).start()