def async_job(verb, job, threadpool, locks, event_queue, log_path):
    """Run a sequence of Stages from a Job and collect their output.

    :param verb: Name of the verb being executed (used for logging and reproduction commands)
    :param job: A Job instance
    :param threadpool: A thread pool executor for blocking stages
    :param locks: Dict mapping resource names to locks used to synchronize stages across jobs
    :param event_queue: A queue for asynchronous events
    :param log_path: The path in which logfiles can be written
    """

    # Initialize success flag
    all_stages_succeeded = True

    # Jobs start occupying a jobserver job
    occupying_job = True

    # Execute each stage of this job
    for stage in job.stages:
        # Logger reference in this scope for error reporting
        logger = None

        # Abort the job if one of the stages has failed
        if not job.continue_on_failure and not all_stages_succeeded:
            break

        # Check for stage synchronization lock
        if stage.locked_resource is not None:
            lock = locks.setdefault(stage.locked_resource, asyncio.Lock())
            yield from lock
        else:
            lock = FakeLock()

        try:
            # Make sure the stage holds a job token if it requires one;
            # otherwise, release the token temporarily
            if stage.occupy_job:
                if not occupying_job:
                    while job_server.try_acquire() is None:
                        yield from asyncio.sleep(0.05)
                    occupying_job = True
            else:
                if occupying_job:
                    job_server.release()
                    occupying_job = False

            # Notify stage started
            event_queue.put(
                ExecutionEvent('STARTED_STAGE',
                               job_id=job.jid,
                               stage_label=stage.label))

            if type(stage) is CommandStage:
                try:
                    # Initiate the command
                    while True:
                        try:
                            # Update the environment for this stage (respects overrides)
                            stage.update_env(job.env)

                            # Get the logger
                            protocol_type = stage.logger_factory(
                                verb, job.jid, stage.label, event_queue, log_path)

                            # Start asynchronous execution
                            transport, logger = yield from async_execute_process(
                                protocol_type,
                                **stage.async_execute_process_kwargs)
                            break
                        except OSError as exc:
                            if 'Text file busy' in str(exc):
                                # This is a transient error, try again shortly
                                # TODO: report the file causing the problem (exc.filename)
                                yield from asyncio.sleep(0.01)
                                continue
                            raise

                    # Notify that a subprocess has been created
                    event_queue.put(
                        ExecutionEvent('SUBPROCESS',
                                       job_id=job.jid,
                                       stage_label=stage.label,
                                       stage_repro=stage.get_reproduction_cmd(verb, job.jid),
                                       **stage.async_execute_process_kwargs))

                    # Asynchronously yield until this command is completed
                    retcode = yield from logger.complete
                except:  # noqa: E722
                    # A bare except is permissible here because the set of errors which the
                    # CommandStage might raise is unbounded. We capture the traceback here
                    # and save it to the build's log files.
                    logger = IOBufferLogger(verb, job.jid, stage.label, event_queue, log_path)
                    logger.err(str(traceback.format_exc()))
                    retcode = 3

            elif type(stage) is FunctionStage:
                logger = IOBufferLogger(verb, job.jid, stage.label, event_queue, log_path)
                try:
                    # Asynchronously yield until this function is completed
                    retcode = yield from get_loop().run_in_executor(
                        threadpool, stage.function, logger, event_queue)
                except:  # noqa: E722
                    # A bare except is permissible here because the set of errors which the
                    # FunctionStage might raise is unbounded. We capture the traceback here
                    # and save it to the build's log files.
                    logger.err('Stage `{}` failed with arguments:'.format(stage.label))
                    for arg_val in stage.args:
                        logger.err('  {}'.format(arg_val))
                    for arg_name, arg_val in stage.kwargs.items():
                        logger.err('  {}: {}'.format(arg_name, arg_val))
                    logger.err(str(traceback.format_exc()))
                    retcode = 3

            else:
                raise TypeError("Bad Job Stage: {}".format(stage))

            # Set whether this stage succeeded
            stage_succeeded = (retcode == 0)

            # Update success tracker from this stage
            all_stages_succeeded = all_stages_succeeded and stage_succeeded

            # Store the results from this stage
            event_queue.put(
                ExecutionEvent('FINISHED_STAGE',
                               job_id=job.jid,
                               stage_label=stage.label,
                               succeeded=stage_succeeded,
                               stdout=logger.get_stdout_log(),
                               stderr=logger.get_stderr_log(),
                               interleaved=logger.get_interleaved_log(),
                               logfile_filename=logger.unique_logfile_name,
                               repro=stage.get_reproduction_cmd(verb, job.jid),
                               retcode=retcode))

            # Close the logger
            logger.close()
        finally:
            lock.release()

    # Finally, return whether all stages of the job completed
    return (job.jid, all_stages_succeeded)
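# Illustrative sketch (an assumption, not part of this module): async_job invokes a
# FunctionStage's callable as `stage.function(logger, event_queue)` in a worker thread
# and treats its return value as the stage's retcode, where 0 means success. The
# function below is a hypothetical example of that contract, assuming the logger
# exposes an `out()` counterpart to the `err()` method used above.
def _example_stage_function(logger, event_queue):
    # Write to the stage's logger; this output is collected into the stage's logs
    logger.out('Running an example stage')
    # A non-zero return value would mark the stage (and thus the job) as failed
    return 0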
def execute_jobs(
        verb,
        jobs,
        locks,
        event_queue,
        log_path,
        max_toplevel_jobs=None,
        continue_on_failure=False,
        continue_without_deps=False):
    """Process a number of jobs asynchronously.

    :param verb: Name of the verb being executed (used for logging and reproduction commands)
    :param jobs: A list of topologically-sorted Jobs with no circular dependencies.
    :param locks: Dict mapping resource names to locks used to synchronize stages across jobs
    :param event_queue: A Python queue for reporting events.
    :param log_path: The path in which logfiles can be written
    :param max_toplevel_jobs: Max number of top-level jobs that can run concurrently
    :param continue_on_failure: Keep running jobs even if one fails.
    :param continue_without_deps: Run jobs even if their dependencies fail.
    """

    # Map of jid -> job
    job_map = dict([(j.jid, j) for j in jobs])
    # Jobs which are not ready to be executed
    pending_jobs = []
    # Jobs which are ready to be executed once workers are available
    queued_jobs = []
    # List of active jobs
    active_jobs = []
    # Set of active job futures
    active_job_fs = set()
    # Dict of completed jobs job_id -> succeeded
    completed_jobs = {}
    # List of jobs whose deps failed
    abandoned_jobs = []

    # Make sure job server has been initialized
    if not job_server.initialized():
        raise RuntimeError('JobServer has not been initialized.')

    # Create a thread pool executor for blocking python stages in the asynchronous jobs
    threadpool = ThreadPoolExecutor(max_workers=job_server.max_jobs())

    # Immediately abandon jobs with bad dependencies
    pending_jobs, new_abandoned_jobs = split(
        jobs, lambda j: all([d in job_map for d in j.deps]))

    for abandoned_job in new_abandoned_jobs:
        abandoned_jobs.append(abandoned_job)
        event_queue.put(
            ExecutionEvent(
                'ABANDONED_JOB',
                job_id=abandoned_job.jid,
                reason='MISSING_DEPS',
                dep_ids=[d for d in abandoned_job.deps if d not in job_map]))

    # Initialize list of ready and pending jobs (jobs not ready to be executed)
    queued_jobs, pending_jobs = split(pending_jobs, lambda j: len(j.deps) == 0)

    # Process all jobs asynchronously until there are none left
    while len(active_job_fs) + len(queued_jobs) + len(pending_jobs) > 0:

        # Activate jobs while the jobserver dispenses tokens
        while ((len(queued_jobs) > 0) and
               ((max_toplevel_jobs is None) or (len(active_jobs) < max_toplevel_jobs)) and
               (job_server.try_acquire() is not None)):

            # Pop a job off of the job queue
            job = queued_jobs.pop(0)

            # Label it (for debugging)
            job_server.add_label(job.jid)

            # Notify that the job is being started
            event_queue.put(ExecutionEvent('STARTED_JOB', job_id=job.jid))

            # Start the job coroutine
            active_jobs.append(job)
            active_job_fs.add(
                async_job(verb, job, threadpool, locks, event_queue, log_path))

        # Report running jobs
        event_queue.put(
            ExecutionEvent('JOB_STATUS',
                           pending=[j.jid for j in pending_jobs],
                           queued=[j.jid for j in queued_jobs],
                           active=[j.jid for j in active_jobs],
                           abandoned=[j.jid for j in abandoned_jobs],
                           completed=completed_jobs))

        # Process jobs as they complete asynchronously
        done_job_fs, active_job_fs = yield from asyncio.wait(
            active_job_fs,
            timeout=0.10,
            return_when=FIRST_COMPLETED)

        for done_job_f in done_job_fs:
            # Capture a result once the job has finished
            job_id, succeeded = yield from done_job_f

            # Release a jobserver token now that this job has finished
            job_server.release(job_id)
            active_jobs = [j for j in active_jobs if j.jid != job_id]

            # Generate event with the results of this job
            event_queue.put(
                ExecutionEvent('FINISHED_JOB',
                               job_id=job_id,
                               succeeded=succeeded))

            # Add the job to the completed list
            completed_jobs[job_id] = succeeded

            # Handle failure modes
            if not succeeded:
                # Handle different abandoning policies
                if not continue_on_failure:
                    # Abort all pending jobs if any job fails
                    new_abandoned_jobs = queued_jobs + pending_jobs
                    queued_jobs = []
                    pending_jobs = []

                    # Notify that jobs have been abandoned
                    for abandoned_job in new_abandoned_jobs:
                        abandoned_jobs.append(abandoned_job)
                        event_queue.put(
                            ExecutionEvent('ABANDONED_JOB',
                                           job_id=abandoned_job.jid,
                                           reason='PEER_FAILED',
                                           peer_job_id=job_id))

                elif not continue_without_deps:
                    unhandled_abandoned_job_ids = [job_id]

                    # Abandon jobs which depend on abandoned jobs
                    while len(unhandled_abandoned_job_ids) > 0:
                        # Get the abandoned job
                        abandoned_job_id = unhandled_abandoned_job_ids.pop(0)

                        # Abandon all pending jobs which depend on this job_id
                        unhandled_abandoned_jobs, pending_jobs = split(
                            pending_jobs,
                            lambda j: abandoned_job_id in j.deps)

                        # Handle each new abandoned job
                        for abandoned_job in unhandled_abandoned_jobs:
                            abandoned_jobs.append(abandoned_job)
                            # Notify if any jobs have been abandoned
                            event_queue.put(
                                ExecutionEvent(
                                    'ABANDONED_JOB',
                                    job_id=abandoned_job.jid,
                                    reason='DEP_FAILED',
                                    direct_dep_job_id=abandoned_job_id,
                                    dep_job_id=job_id))

                        # Add additional job ids to check
                        unhandled_abandoned_job_ids.extend(
                            [j.jid for j in unhandled_abandoned_jobs])

        # Update the list of ready jobs (based on completed job dependencies)
        new_queued_jobs, pending_jobs = split(
            pending_jobs, lambda j: j.all_deps_completed(completed_jobs))
        queued_jobs.extend(new_queued_jobs)

        # Notify of newly queued jobs
        for queued_job in new_queued_jobs:
            event_queue.put(ExecutionEvent('QUEUED_JOB', job_id=queued_job.jid))

        # Report running jobs
        event_queue.put(
            ExecutionEvent('JOB_STATUS',
                           pending=[j.jid for j in pending_jobs],
                           queued=[j.jid for j in queued_jobs],
                           active=[j.jid for j in active_jobs],
                           abandoned=[j.jid for j in abandoned_jobs],
                           completed=completed_jobs))

    return all(completed_jobs.values())
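# Illustrative usage sketch (an assumption, not an API defined in this module):
# execute_jobs is an old-style generator coroutine, so a caller would typically drive
# it to completion on the event loop while a separate thread drains event_queue.
# The helper name `run_jobs` and its arguments are hypothetical, and this assumes a
# Python/asyncio version that still accepts generator-based coroutines.
def run_jobs(verb, jobs, locks, event_queue, log_path):
    # run_until_complete returns True only if every job succeeded
    return get_loop().run_until_complete(
        execute_jobs(verb, jobs, locks, event_queue, log_path))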