def work(self):
    # We should probably open up our own redis client
    self.client = qless.client(url=self.host)
    self.queues = [self.client.queues[q] for q in self.queues]

    if not os.path.isdir(self.sandbox):
        os.makedirs(self.sandbox)
    self.clean()

    # First things first, we should clear out any jobs that
    # we're responsible for off-hand
    while len(self.jids):
        try:
            job = self.client.jobs[self.jids.pop(0)]
            # If we still have access to it, then we should process it
            if job.heartbeat():
                logger.info('Resuming %s' % job.jid)
                self.setproctitle('Working %s (%s)' % (
                    job.jid, job.klass_name))
                job.process()
                self.clean()
            else:
                logger.warn('Lost heart on would-be resumed job %s' % job.jid)
        except KeyboardInterrupt:
            return

    sleep_cycles = 0
    while True:
        try:
            for queue in self.queues:
                job = queue.pop()
                if job:
                    sleep_cycles = -1
                    self.setproctitle('Working %s (%s)' % (
                        job.jid, job.klass_name))
                    job.process()
                    self.clean()

            if self.stop_on_idle and sleep_cycles >= 2:
                logger.info('Idle for too long, quitting')
                import sys
                sys.exit(self.IDLE_EXIT_STATUS)

            if sleep_cycles >= 0:
                self.setproctitle('sleeping...')
                logger.debug('Sleeping for %fs' % self.interval)
                time.sleep(self.interval)
                sleep_cycles += 1
            else:
                sleep_cycles = 0
        except KeyboardInterrupt:
            return
def stop(self, sig=signal.SIGINT):
    '''Stop all the workers, and then wait for them'''
    for cpid in self.sandboxes.keys():
        logger.warn('Stopping %i...' % cpid)
        os.kill(cpid, sig)

    # While we still have children running, wait for them. We pop entries
    # as we go, so iterate over a copy of the keys.
    for cpid in list(self.sandboxes.keys()):
        try:
            logger.info('Waiting for %i...' % cpid)
            pid, status = os.waitpid(cpid, 0)
            logger.warn('%i stopped with status %i' % (pid, status >> 8))
        except OSError:  # pragma: no cover
            logger.exception('Error waiting for %i...' % cpid)
        finally:
            # Pop by cpid -- pid is not bound if waitpid raised
            self.sandboxes.pop(cpid, None)
def run(self):
    '''Run this worker'''
    self.signals(('TERM', 'INT', 'QUIT'))
    # Divide up the jobs that we have to divvy up between the workers.
    # This produces evenly-sized groups of jobs
    resume = self.divide(self.resume, self.count)
    for index in range(self.count):
        # The sandbox for the child worker
        sandbox = os.path.join(
            os.getcwd(), 'qless-py-workers', 'sandbox-%s' % index)
        cpid = os.fork()
        if cpid:
            logger.info('Spawned worker %i' % cpid)
            self.sandboxes[cpid] = sandbox
        else:  # pragma: no cover
            # Move to the sandbox as the current working directory
            with Worker.sandbox(sandbox):
                os.chdir(sandbox)
                try:
                    self.spawn(resume=resume[index], sandbox=sandbox).run()
                except:
                    logger.exception('Exception in spawned worker')
                finally:
                    os._exit(0)

    try:
        while not self.shutdown:
            pid, status = os.wait()
            logger.warn('Worker %i died with status %i from signal %i' % (
                pid, status >> 8, status & 0xff))
            sandbox = self.sandboxes.pop(pid)
            cpid = os.fork()
            if cpid:
                logger.info('Spawned replacement worker %i' % cpid)
                self.sandboxes[cpid] = sandbox
            else:  # pragma: no cover
                with Worker.sandbox(sandbox):
                    os.chdir(sandbox)
                    try:
                        self.spawn(sandbox=sandbox).run()
                    except:
                        logger.exception('Exception in spawned worker')
                    finally:
                        os._exit(0)
    finally:
        self.stop(signal.SIGKILL)
def work(self):
    # We should probably open up our own redis client
    self.client = qless.client(self.host, self.port)
    self.queues = [self.client.queues[q] for q in self.queues]

    if not os.path.isdir(self.sandbox):
        os.makedirs(self.sandbox)
    self.clean()

    # First things first, we should clear out any jobs that
    # we're responsible for off-hand
    while len(self.jids):
        try:
            job = self.client.jobs[self.jids.pop(0)]
            # If we still have access to it, then we should process it
            if job.heartbeat():
                logger.info('Resuming %s' % job.jid)
                self.setproctitle('Working %s (%s)' % (
                    job.jid, job.klass_name))
                job.process()
                self.clean()
            else:
                logger.warn('Lost heart on would-be resumed job %s' % job.jid)
        except KeyboardInterrupt:
            return

    while True:
        try:
            seen = False
            for queue in self.queues:
                job = queue.pop()
                if job:
                    seen = True
                    self.setproctitle('Working %s (%s)' % (
                        job.jid, job.klass_name))
                    job.process()
                    self.clean()

            if not seen:
                self.setproctitle('sleeping...')
                logger.debug('Sleeping for %fs' % self.interval)
                time.sleep(self.interval)
        except KeyboardInterrupt:
            return
def stop(self, sig=signal.SIGINT):
    '''Stop all the workers, and then wait for them'''
    for cpid in self.sandboxes.keys():
        logger.warn('Stopping %i...' % cpid)
        try:
            os.kill(cpid, sig)
        except OSError:  # pragma: no cover
            logger.exception('Error stopping %s...' % cpid)

    # While we still have children running, wait for them. We pop entries
    # as we go, so iterate over a copy of the keys.
    for cpid in list(self.sandboxes.keys()):
        try:
            logger.info('Waiting for %i...' % cpid)
            pid, status = os.waitpid(cpid, 0)
            logger.warn('%i stopped with status %i' % (pid, status >> 8))
        except OSError:  # pragma: no cover
            logger.exception('Error waiting for %i...' % cpid)
        finally:
            self.sandboxes.pop(cpid, None)
def stop(self, sig=signal.SIGINT):
    '''Stop all the workers, and then wait for them'''
    for cpid in self.sandboxes:
        logger.warn('Stopping %i...' % cpid)
        try:
            os.kill(cpid, sig)
        except OSError:  # pragma: no cover
            logger.exception('Error stopping %s...' % cpid)

    # While we still have children running, wait for them.
    # We edit the dictionary during the loop, so we need to copy its keys.
    for cpid in list(self.sandboxes):
        try:
            logger.info('Waiting for %i...' % cpid)
            pid, status = os.waitpid(cpid, 0)
            logger.warn('%i stopped with status %i' % (pid, status >> 8))
        except OSError:  # pragma: no cover
            logger.exception('Error waiting for %i...' % cpid)
        finally:
            self.sandboxes.pop(cpid, None)
def fail(self, group, message):
    """Mark the particular job as failed, with the provided group, and a
    more specific message. By `group`, we mean some phrase that might be
    one of several categorical modes of failure. The `message` is
    something more job-specific, like perhaps a traceback.

    This method should __not__ be used to note that a job has been dropped
    or has failed in a transient way. This method __should__ be used to
    note that a job has something really wrong with it that must be
    remedied.

    The motivation behind the `group` is so that similar errors can be
    grouped together. Optionally, updated data can be provided for the
    job. A job in any state can be marked as failed. If it has been given
    to a worker as a job, then its subsequent requests to heartbeat or
    complete that job will fail. Failed jobs are kept until they are
    canceled or completed.

    __Returns__ the id of the failed job if successful, or `False` on
    failure."""
    logger.warn("Failing %s (%s): %s" % (self.jid, group, message))
    return self.client("fail", self.jid, self.client.worker_name, group,
                       message, json.dumps(self.data)) or False
def work(self):
    # We should probably open up our own redis client
    self.client = qless.client(self.host, self.port,
                               password=self.password)
    self.queues = [self.client.queues[q] for q in self.queues]

    if not os.path.isdir(self.sandbox):
        os.makedirs(self.sandbox)
    self.clean()

    # First things first, we should clear out any jobs that
    # we're responsible for off-hand
    while len(self.jids):
        try:
            job = self.client.jobs[self.jids.pop(0)]
            # If we still have access to it, then we should process it
            if job.heartbeat():
                logger.info('Resuming %s' % job.jid)
                self.setproctitle('Working %s (%s)' % (
                    job.jid, job.klass_name))
                job.process()
                self.clean()
            else:
                logger.warn('Lost heart on would-be resumed job %s' % job.jid)
        except KeyboardInterrupt:
            return

    while True:
        try:
            seen = False
            for queue in self.queues:
                job = queue.pop()
                if job:
                    seen = True
                    self.setproctitle('Working %s (%s)' % (
                        job.jid, job.klass_name))
                    job.process()
                    self.clean()

            if not seen:
                self.setproctitle('sleeping...')
                logger.debug('Sleeping for %fs' % self.interval)
                time.sleep(self.interval)
        except KeyboardInterrupt:
            return
def handler(self, signum, frame):  # pragma: no cover
    '''Signal handler for this process'''
    if signum == signal.SIGQUIT:
        # QUIT - Finish processing, but don't do any more work after that
        self.stop()
    elif signum == signal.SIGUSR1:
        # USR1 - Print the backtrace
        message = ''.join(traceback.format_stack(frame))
        message = 'Signaled traceback for %s:\n%s' % (os.getpid(), message)
        print(message, file=sys.stderr)
        logger.warn(message)
    elif signum == signal.SIGUSR2:
        # USR2 - Enter a debugger
        # Much thanks to http://stackoverflow.com/questions/132058
        data = {'_frame': frame}      # Allow access to frame object.
        data.update(frame.f_globals)  # Unless shadowed by global
        data.update(frame.f_locals)
        # Build up a message with a traceback
        message = ''.join(traceback.format_stack(frame))
        message = 'Traceback:\n%s' % message
        code.InteractiveConsole(data).interact(message)
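
# A minimal sketch, assumed rather than taken from the source, of the
# registration step this handler relies on: the `run` method above calls a
# `signals` helper that is not shown in these snippets. It presumably wires
# each named signal to `self.handler` roughly like this; the name and
# default argument below are illustrative only.
def signals(self, names=('QUIT', 'USR1', 'USR2')):
    '''Hypothetical helper: route the named signals to self.handler.'''
    import signal
    for name in names:
        signal.signal(getattr(signal, 'SIG' + name), self.handler)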
def fail(self, group, message):
    '''Mark the particular job as failed, with the provided group, and a
    more specific message. By `group`, we mean some phrase that might be
    one of several categorical modes of failure. The `message` is
    something more job-specific, like perhaps a traceback.

    This method should __not__ be used to note that a job has been dropped
    or has failed in a transient way. This method __should__ be used to
    note that a job has something really wrong with it that must be
    remedied.

    The motivation behind the `group` is so that similar errors can be
    grouped together. Optionally, updated data can be provided for the
    job. A job in any state can be marked as failed. If it has been given
    to a worker as a job, then its subsequent requests to heartbeat or
    complete that job will fail. Failed jobs are kept until they are
    canceled or completed.

    __Returns__ the id of the failed job if successful, or `False` on
    failure.'''
    logger.warn('Failing %s (%s): %s', self.jid, group, message)
    return self.client('fail', self.jid, self.client.worker_name, group,
                       message, json.dumps(self.data)) or False
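
# A hedged usage sketch, not taken from the source: application job code
# might call `fail` when a job hits a permanent, non-transient problem.
# `MyJob` and `validate` are hypothetical names used only for illustration.
import traceback

class MyJob(object):
    @staticmethod
    def process(job):
        try:
            validate(job.data)  # hypothetical application-specific check
        except ValueError:
            # Something is permanently wrong with this job's data, so fail
            # it with a categorical group and a traceback as the message
            job.fail('myjob-validation-error', traceback.format_exc())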
def _import(klass):
    '''1) Get a reference to the module
    2) Check the file that module's imported from
    3) If that file's been updated, force a reload of that module
    return it'''
    mod = __import__(klass.rpartition('.')[0])
    for segment in klass.split('.')[1:-1]:
        mod = getattr(mod, segment)

    # Alright, now check the file associated with it. Note that classes
    # defined in __main__ don't have a __file__ attribute
    if klass not in BaseJob._loaded:
        BaseJob._loaded[klass] = time.time()
    if hasattr(mod, '__file__'):
        try:
            mtime = os.stat(mod.__file__).st_mtime
            if BaseJob._loaded[klass] < mtime:
                mod = reload_module(mod)
        except OSError:
            logger.warn('Could not check modification time of %s',
                        mod.__file__)

    return getattr(mod, klass.rpartition('.')[2])
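
# A short illustration of the dotted-path convention `_import` assumes,
# using hypothetical module and class names. That the helper is exposed as
# `BaseJob._import` is itself an assumption, inferred from its use of
# `BaseJob._loaded` above.
klass = BaseJob._import('myapp.jobs.EmailJob')
# `klass` is now the EmailJob attribute of the myapp.jobs module, with the
# module reloaded first if its source file changed on disk since the last
# import.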
def stop(self):
    # Stop all the workers, and then wait for them
    for cpid in self.sandboxes.keys():
        logger.warn('Stopping %i...' % cpid)
        os.kill(cpid, signal.SIGINT)

    while True:
        try:
            pid, status = os.wait()
            self.sandboxes.pop(pid, None)
            # Log the pid that actually stopped, not the last one signaled
            logger.warn('Worker %i stopped.' % pid)
        except OSError:
            break

    for cpid in self.sandboxes.keys():
        logger.warn('Could not wait for %i' % cpid)
def kill(self, jid):
    '''Stop the greenlet processing the provided jid'''
    greenlet = self.greenlets.get(jid)
    if greenlet is not None:
        logger.warn('Lost ownership of %s' % jid)
        greenlet.kill()
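
# A hedged sketch, assumed rather than taken from the source, of how the
# `greenlets` mapping used by `kill` might be maintained in a gevent-based
# worker: spawn a greenlet per job, remember it by jid, and drop the entry
# when the greenlet finishes. `process_async` is a hypothetical helper.
import gevent

def process_async(self, job):
    '''Hypothetical: process `job` in its own greenlet, tracked by jid.'''
    greenlet = gevent.spawn(job.process)
    self.greenlets[job.jid] = greenlet
    greenlet.link(lambda g: self.greenlets.pop(job.jid, None))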
def run(self):
    # If this worker is meant to be resumable, then we should find out
    # what jobs this worker was working on beforehand.
    if self.resume:
        jids_to_resume = self.client.workers[self.client.worker_name]['jobs']
    else:
        jids_to_resume = []

    pids = []
    for i in range(self.count):
        slot = {
            'worker_id': i,
            'sandbox': os.path.join(
                self.workdir, 'qless-py-workers', 'sandbox-%i' % i)
        }
        cpid = os.fork()
        if cpid:
            logger.info('Spawned worker %i' % cpid)
            self.sandboxes[cpid] = slot
            pids.append(str(cpid))
        else:
            # Set the value of the metadata so that jobs can detect
            # what worker they're running on
            import qless.worker
            qless.worker.meta = slot
            # Make note that we're not the master, and then save our
            # sandbox and worker id for reference
            self.master = False
            self.sandbox = slot['sandbox']
            self.worker_id = slot['worker_id']
            # Also, we should take our share of the jobs that we want
            # to resume, if any.
            start = (i * len(jids_to_resume)) // self.count
            end = ((i + 1) * len(jids_to_resume)) // self.count
            self.jids = jids_to_resume[start:end]
            return self.work()

    with open(os.path.join(self.workdir, 'workers-pid.txt'), 'w') as f:
        f.write(str(os.getpid()))
        f.write('\n')
        for pid in pids:
            f.write(pid)
            f.write('\n')

    while self.master:
        try:
            pid, status = os.wait()
            logger.warn('Worker %i died with status %i from signal %i' % (
                pid, status >> 8, status & 0xff))
            slot = self.sandboxes.pop(pid)
            cpid = os.fork()
            if cpid:
                logger.info('Spawned replacement worker %i' % cpid)
                self.sandboxes[cpid] = slot
            else:
                # Set the value of the metadata so that jobs can detect
                # what worker they're running on
                import qless.worker
                qless.worker.meta = slot
                # Make note that we're not the master, and then save our
                # sandbox and worker id for reference
                self.master = False
                self.sandbox = slot['sandbox']
                self.worker_id = slot['worker_id']
                # NOTE: In the case that the worker died, we're going to
                # assume that something about the job(s) it was working on
                # made the worker exit, and so we're going to ignore any
                # jobs that we might have been working on. It's also
                # significantly more difficult than the above problem of
                # simply distributing work to /new/ workers, rather than
                # a respawned worker.
                return self.work()
        except KeyboardInterrupt:
            break

    if self.master:
        self.stop()
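
# A hedged example of how job code might consume the `qless.worker.meta`
# slot set in the child processes above, e.g. to write scratch files into
# its own sandbox. The helper name and the fallback default are assumptions
# for illustration only.
import os
import qless.worker

def scratch_path(filename):
    '''Hypothetical helper: a path inside this worker's sandbox.'''
    meta = getattr(qless.worker, 'meta', None) or {'sandbox': os.getcwd()}
    return os.path.join(meta['sandbox'], filename)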