def requeue_processor_job(last_job: ProcessorJob) -> None:
    """Queues a new processor job to replace last_job.

    The new processor job copies pipeline_applied and volume_index from
    last_job, has num_retries one greater than last_job.num_retries, and
    is associated with the same original files and datasets. For the
    SALMON and AFFY_TO_PCL pipelines the RAM amount is stepped up to the
    next tier when last_job.ram_amount matches a known tier; otherwise
    it is carried over unchanged.

    If send_job reports success, last_job is marked as retried (with
    success=False) and linked to the new job. If send_job returns a
    falsy value, or anything in the dispatch step raises, the new job is
    deleted so that last_job can be picked up again by a later loop.
    """
    num_retries = last_job.num_retries + 1

    # The Salmon pipeline is quite RAM-sensitive.
    # Try it again with an increased RAM amount, if possible.
    new_ram_amount = last_job.ram_amount

    # These initial values are set in common/job_lookup.py:determine_ram_amount
    if last_job.pipeline_applied == "SALMON":
        if new_ram_amount == 12288:
            new_ram_amount = 16384
        elif new_ram_amount == 16384:
            new_ram_amount = 32768
    # The AFFY pipeline is somewhat RAM-sensitive.
    # Try it again with an increased RAM amount, if possible.
    elif last_job.pipeline_applied == "AFFY_TO_PCL":
        if new_ram_amount == 2048:
            new_ram_amount = 4096
        elif new_ram_amount == 4096:
            new_ram_amount = 8192

    new_job = ProcessorJob(
        num_retries=num_retries,
        pipeline_applied=last_job.pipeline_applied,
        ram_amount=new_ram_amount,
        volume_index=last_job.volume_index,
    )
    new_job.save()

    # Carry every input of the failed job over to its replacement.
    for original_file in last_job.original_files.all():
        ProcessorJobOriginalFileAssociation.objects.get_or_create(
            processor_job=new_job, original_file=original_file
        )

    for dataset in last_job.datasets.all():
        ProcessorJobDatasetAssociation.objects.get_or_create(
            processor_job=new_job, dataset=dataset
        )

    try:
        logger.debug(
            "Requeuing Processor Job which had ID %d with a new Processor Job with ID %d.",
            last_job.id,
            new_job.id,
        )
        if send_job(ProcessorPipeline[last_job.pipeline_applied], job=new_job, is_dispatch=True):
            last_job.retried = True
            last_job.success = False
            last_job.retried_job = new_job
            last_job.save()
        else:
            # Can't communicate with nomad just now, leave the job for a later loop.
            new_job.delete()
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt and SystemExit still propagate.
        # logger.exception logs at ERROR level and attaches the traceback.
        logger.exception(
            "Failed to requeue Processor Job which had ID %d with a new Processor Job with ID %d.",
            last_job.id,
            new_job.id,
        )
        # Can't communicate with nomad just now, leave the job for a later loop.
        new_job.delete()
def requeue_job(job, volume_index):
    """Requeue a DownloaderJob or ProcessorJob unconditionally.

    This mirrors the Foreman's requeue_downloader_job and
    requeue_processor_job, minus their extra logic: here the job is
    requeued no matter how many times it has already been retried.

    A fresh job of the same type is saved and linked to the original
    files of `job`, then handed to send_job. For a ProcessorJob the new
    job uses `volume_index` and is dispatched immediately; for a
    DownloaderJob `volume_index` is not used and is_dispatch is False so
    that the Foreman can control the flow of DownloaderJobs.

    Returns True when send_job succeeded and `job` was marked as retried.
    Returns False when send_job returned a falsy value or an exception
    was raised during dispatch; in both cases the new job is deleted.

    Raises ValueError if `job` is neither a ProcessorJob nor a
    DownloaderJob.
    """
    # Every replacement job starts out with num_retries = 1, whatever the
    # retry count of the job it replaces. Presumably all of these have
    # failed at least once already, so there may be a good reason.
    # NOTE(review): the previous comment here spoke of "2 retries" while
    # the code assigns 1 -- confirm which value is intended.
    retry_count = 1

    if isinstance(job, ProcessorJob):
        new_job = ProcessorJob(
            num_retries=retry_count,
            pipeline_applied=job.pipeline_applied,
            ram_amount=job.ram_amount,
            volume_index=volume_index,
        )
        new_job.save()

        for source_file in job.original_files.all():
            ProcessorJobOriginalFileAssociation.objects.get_or_create(
                processor_job=new_job, original_file=source_file
            )

        job_type = ProcessorPipeline[job.pipeline_applied]
        # Processor jobs go straight to Nomad.
        dispatch_immediately = True
    elif isinstance(job, DownloaderJob):
        new_job = DownloaderJob(
            num_retries=retry_count,
            downloader_task=job.downloader_task,
            accession_code=job.accession_code,
        )
        new_job.save()

        for source_file in job.original_files.all():
            DownloaderJobOriginalFileAssociation.objects.get_or_create(
                downloader_job=new_job, original_file=source_file
            )

        job_type = Downloaders[job.downloader_task]
        # Downloader jobs are left for the Foreman to dispatch.
        dispatch_immediately = False
    else:
        raise ValueError("Told to requeue a job that's not a ProcessorJob nor DownloaderJob!")

    try:
        was_sent = send_job(job_type, job=new_job, is_dispatch=dispatch_immediately)
        if not was_sent:
            logger.error(
                (
                    "Failed to requeue %s which had ID %d with a new %s "
                    "with ID %d because send_job returned false."
                ),
                type(job).__name__,
                job.id,
                type(job).__name__,
                new_job.id,
            )
            # Can't communicate with nomad just now, leave the job for a later loop.
            new_job.delete()
            return False

        # The replacement is on its way: record the hand-off on the old job.
        job.retried = True
        job.success = False
        job.retried_job = new_job
        job.save()
    except Exception:
        logger.exception(
            (
                "Failed to requeue %s which had ID %d with a new %s "
                "with ID %d because send_job raised an exception."
            ),
            type(job).__name__,
            job.id,
            type(job).__name__,
            new_job.id,
        )
        # Can't communicate with nomad just now, leave the job for a later loop.
        new_job.delete()
        return False

    return True
def requeue_processor_job(last_job: ProcessorJob) -> bool:
    """Queues a new processor job to replace last_job.

    The new processor job copies downloader_job, pipeline_applied and
    batch_job_queue from last_job, has num_retries one greater than
    last_job.num_retries, and is associated with the same original files
    and datasets. When last_job has a start time, the RAM amount may be
    stepped up depending on the pipeline and the current RAM tier;
    otherwise last_job.ram_amount is reused as is.

    Returns True when send_job succeeded and last_job was marked as
    retried (with success=False) and linked to the new job. Returns
    False when send_job returned a falsy value or an exception was
    raised during dispatch; in both cases the new job is deleted so
    last_job can be picked up again by a later loop.
    """
    num_retries = last_job.num_retries + 1

    # The Salmon pipeline is quite RAM-sensitive.
    # Try it again with an increased RAM amount, if possible.
    new_ram_amount = last_job.ram_amount

    # If there's no start time then it's likely that the instance got
    # cycled which means we didn't get OOM-killed, so we don't need to
    # increase the RAM amount.
    if last_job.start_time:
        pipeline = last_job.pipeline_applied
        current_ram = last_job.ram_amount

        # There's only one size of tximport jobs.
        if pipeline == ProcessorPipeline.TXIMPORT.value:
            new_ram_amount = 32768
        # These initial values are set in common/job_lookup.py:determine_ram_amount
        elif pipeline in [
            ProcessorPipeline.SALMON.value,
            ProcessorPipeline.TRANSCRIPTOME_INDEX_LONG.value,
            ProcessorPipeline.TRANSCRIPTOME_INDEX_SHORT.value,
            ProcessorPipeline.QN_REFERENCE.value,
        ]:
            # Each tier maps to the next one up; unknown amounts are kept.
            next_tier = {
                4096: 8192,
                8192: 12288,
                12288: 16384,
                16384: 32768,
                32768: 65536,
            }
            new_ram_amount = next_tier.get(current_ram, current_ram)
        # The AFFY pipeline is somewhat RAM-sensitive.
        # Also NO_OP can fail and be retried, so we want to attempt ramping up ram.
        # Try it again with an increased RAM amount, if possible.
        elif pipeline in (
            ProcessorPipeline.AFFY_TO_PCL.value,
            ProcessorPipeline.NO_OP.value,
        ):
            next_tier = {2048: 4096, 4096: 8192, 8192: 32768}
            new_ram_amount = next_tier.get(current_ram, current_ram)
        # failure_reason may be None; treat that as "no matching message"
        # instead of letting the `in` test raise a TypeError.
        elif (
            pipeline == ProcessorPipeline.ILLUMINA_TO_PCL.value
            and "non-zero exit status -9" in (last_job.failure_reason or "")
        ):
            next_tier = {2048: 4096, 4096: 8192}
            new_ram_amount = next_tier.get(current_ram, current_ram)

    new_job = ProcessorJob(
        downloader_job=last_job.downloader_job,
        num_retries=num_retries,
        pipeline_applied=last_job.pipeline_applied,
        ram_amount=new_ram_amount,
        batch_job_queue=last_job.batch_job_queue,
    )
    new_job.save()

    # Carry every input of the failed job over to its replacement.
    for original_file in last_job.original_files.all():
        ProcessorJobOriginalFileAssociation.objects.get_or_create(
            processor_job=new_job, original_file=original_file
        )

    for dataset in last_job.datasets.all():
        ProcessorJobDatasetAssociation.objects.get_or_create(
            processor_job=new_job, dataset=dataset
        )

    try:
        logger.debug(
            "Requeuing Processor Job which had ID %d with a new Processor Job with ID %d.",
            last_job.id,
            new_job.id,
        )
        if send_job(ProcessorPipeline[last_job.pipeline_applied], job=new_job, is_dispatch=True):
            last_job.retried = True
            last_job.success = False
            last_job.retried_job = new_job
            last_job.save()
        else:
            # Can't communicate with Batch just now, leave the job for a later loop.
            new_job.delete()
            return False
    except Exception:
        # logger.warning replaces the deprecated logger.warn alias;
        # exc_info attaches the traceback to the log record.
        logger.warning(
            "Failed to requeue Processor Job which had ID %d with a new Processor Job with ID %d.",
            last_job.id,
            new_job.id,
            exc_info=True,
        )
        # Can't communicate with Batch just now, leave the job for a later loop.
        new_job.delete()
        return False

    return True