示例#1
0
def requeue_processor_job(last_job: ProcessorJob) -> None:
    """Queue a new processor job that retries ``last_job``.

    The new job copies ``pipeline_applied`` and ``volume_index`` from
    ``last_job``, has ``num_retries`` one greater than
    ``last_job.num_retries``, and is associated with the same original
    files and datasets. For the SALMON and AFFY_TO_PCL pipelines the RAM
    amount is stepped up to the next tier when the previous amount matches
    a known tier; otherwise it is carried over unchanged.

    If ``send_job`` returns a truthy value, ``last_job`` is marked as
    retried (and unsuccessful) and linked to the new job. If ``send_job``
    returns a falsy value, or the dispatch step raises, the new job is
    deleted and ``last_job`` is left untouched.
    """
    num_retries = last_job.num_retries + 1

    new_ram_amount = last_job.ram_amount

    # These initial values are set in common/job_lookup.py:determine_ram_amount
    if last_job.pipeline_applied == "SALMON":
        # The Salmon pipeline is quite RAM-sensitive.
        # Try it again with an increased RAM amount, if possible.
        if new_ram_amount == 12288:
            new_ram_amount = 16384
        elif new_ram_amount == 16384:
            new_ram_amount = 32768
    elif last_job.pipeline_applied == "AFFY_TO_PCL":
        # The AFFY pipeline is somewhat RAM-sensitive.
        # Try it again with an increased RAM amount, if possible.
        if new_ram_amount == 2048:
            new_ram_amount = 4096
        elif new_ram_amount == 4096:
            new_ram_amount = 8192

    new_job = ProcessorJob(num_retries=num_retries,
                           pipeline_applied=last_job.pipeline_applied,
                           ram_amount=new_ram_amount,
                           volume_index=last_job.volume_index)
    new_job.save()

    # Carry every input of the previous job over to the new one.
    for original_file in last_job.original_files.all():
        ProcessorJobOriginalFileAssociation.objects.get_or_create(
            processor_job=new_job, original_file=original_file)

    for dataset in last_job.datasets.all():
        ProcessorJobDatasetAssociation.objects.get_or_create(
            processor_job=new_job, dataset=dataset)

    try:
        logger.debug(
            "Requeuing Processor Job which had ID %d with a new Processor Job with ID %d.",
            last_job.id, new_job.id)
        if send_job(ProcessorPipeline[last_job.pipeline_applied],
                    job=new_job,
                    is_dispatch=True):
            last_job.retried = True
            last_job.success = False
            last_job.retried_job = new_job
            last_job.save()
        else:
            # Can't communicate with nomad just now, leave the job for a later loop.
            new_job.delete()
    except Exception:
        # Catch Exception rather than using a bare ``except`` so that
        # KeyboardInterrupt and SystemExit still propagate. logger.exception
        # logs at ERROR level and attaches the traceback of the failure.
        logger.exception(
            "Failed to requeue Processor Job which had ID %d with a new Processor Job with ID %d.",
            last_job.id, new_job.id)
        # Can't communicate with nomad just now, leave the job for a later loop.
        new_job.delete()
def requeue_job(job, volume_index):
    """Requeue ``job``, which may be either a ProcessorJob or a DownloaderJob.

    A fresh job of the same type is created, saved, and associated with
    the original files of ``job``; it is then handed to ``send_job``.
    Unlike the Foreman's own requeue helpers, no retry limit is checked
    here: the job is requeued no matter how often it has been retried.

    ``volume_index`` is only used when ``job`` is a ProcessorJob, where it
    becomes the new job's ``volume_index``.

    Returns True when ``send_job`` accepted the new job and ``job`` was
    marked as retried. Returns False when ``send_job`` returned a falsy
    value or the dispatch step raised; in both cases the new job is
    deleted again.

    Raises ValueError when ``job`` is neither a ProcessorJob nor a
    DownloaderJob.
    """
    # Every requeued job starts with num_retries = 1. These jobs have
    # presumably failed at least once already.
    # NOTE(review): the intent appears to be that the retry limit enforced
    # elsewhere allows only one more attempt -- confirm against that limit.
    retry_count = 1

    if isinstance(job, ProcessorJob):
        new_job = ProcessorJob(
            num_retries=retry_count,
            pipeline_applied=job.pipeline_applied,
            ram_amount=job.ram_amount,
            volume_index=volume_index,
        )
        new_job.save()

        for source_file in job.original_files.all():
            ProcessorJobOriginalFileAssociation.objects.get_or_create(
                processor_job=new_job, original_file=source_file
            )

        job_type = ProcessorPipeline[job.pipeline_applied]
        # Processor jobs are dispatched to Nomad right away.
        dispatch_immediately = True
    elif isinstance(job, DownloaderJob):
        new_job = DownloaderJob(
            num_retries=retry_count,
            downloader_task=job.downloader_task,
            accession_code=job.accession_code,
        )
        new_job.save()

        for source_file in job.original_files.all():
            DownloaderJobOriginalFileAssociation.objects.get_or_create(
                downloader_job=new_job, original_file=source_file
            )

        job_type = Downloaders[job.downloader_task]
        # Downloader jobs are not dispatched immediately so that the
        # Foreman can control their flow.
        dispatch_immediately = False
    else:
        raise ValueError("Told to requeue a job that's not a ProcessorJob nor DownloaderJob!")

    job_kind = type(job).__name__
    failure_prefix = "Failed to requeue %s which had ID %d with a new %s with ID %d because "

    try:
        accepted = send_job(job_type, job=new_job, is_dispatch=dispatch_immediately)
        if not accepted:
            logger.error(
                failure_prefix + "send_job returned false.",
                job_kind,
                job.id,
                job_kind,
                new_job.id,
            )
            # Can't communicate with nomad just now, leave the job for a later loop.
            new_job.delete()
            return False

        job.retried = True
        job.success = False
        job.retried_job = new_job
        job.save()
    except Exception:
        logger.exception(
            failure_prefix + "send_job raised an exception.",
            job_kind,
            job.id,
            job_kind,
            new_job.id,
        )
        # Can't communicate with nomad just now, leave the job for a later loop.
        new_job.delete()
        return False

    return True
示例#3
0
def requeue_processor_job(last_job: ProcessorJob) -> bool:
    """Queue a new processor job that retries ``last_job``.

    The new job copies ``downloader_job``, ``pipeline_applied`` and
    ``batch_job_queue`` from ``last_job``, has ``num_retries`` one greater
    than ``last_job.num_retries``, and is associated with the same
    original files and datasets.

    The RAM amount is only increased when ``last_job.start_time`` is set;
    see the per-pipeline tiers below. A RAM amount that matches no known
    tier is carried over unchanged.

    Returns:
        True when ``send_job`` accepted the new job and ``last_job`` was
        marked as retried. False when ``send_job`` returned a falsy value
        or the dispatch step raised; in both cases the new job is deleted
        again.
    """
    num_retries = last_job.num_retries + 1

    previous_ram = last_job.ram_amount
    new_ram_amount = previous_ram

    # If there's no start time then it's likely that the instance got
    # cycled which means we didn't get OOM-killed, so we don't need to
    # increase the RAM amount.
    if last_job.start_time:
        pipeline = last_job.pipeline_applied

        # There's only one size of tximport jobs.
        if pipeline == ProcessorPipeline.TXIMPORT.value:
            new_ram_amount = 32768
        # The Salmon pipeline is quite RAM-sensitive.
        # These initial values are set in common/job_lookup.py:determine_ram_amount
        elif pipeline in [
                ProcessorPipeline.SALMON.value,
                ProcessorPipeline.TRANSCRIPTOME_INDEX_LONG.value,
                ProcessorPipeline.TRANSCRIPTOME_INDEX_SHORT.value,
                ProcessorPipeline.QN_REFERENCE.value,
        ]:
            next_tier = {
                4096: 8192,
                8192: 12288,
                12288: 16384,
                16384: 32768,
                32768: 65536,
            }
            new_ram_amount = next_tier.get(previous_ram, previous_ram)
        # The AFFY pipeline is somewhat RAM-sensitive.
        # Also NO_OP can fail and be retried, so we want to attempt ramping up ram.
        # Try it again with an increased RAM amount, if possible.
        elif pipeline in (ProcessorPipeline.AFFY_TO_PCL.value,
                          ProcessorPipeline.NO_OP.value):
            next_tier = {2048: 4096, 4096: 8192, 8192: 32768}
            new_ram_amount = next_tier.get(previous_ram, previous_ram)
        # Illumina jobs only get more RAM when the failure reason reports
        # exit status -9. failure_reason may be unset, so treat None as an
        # empty string instead of raising TypeError on the ``in`` test.
        elif (pipeline == ProcessorPipeline.ILLUMINA_TO_PCL.value
              and "non-zero exit status -9" in (last_job.failure_reason or "")):
            next_tier = {2048: 4096, 4096: 8192}
            new_ram_amount = next_tier.get(previous_ram, previous_ram)

    new_job = ProcessorJob(
        downloader_job=last_job.downloader_job,
        num_retries=num_retries,
        pipeline_applied=last_job.pipeline_applied,
        ram_amount=new_ram_amount,
        batch_job_queue=last_job.batch_job_queue,
    )
    new_job.save()

    # Carry every input of the previous job over to the new one.
    for original_file in last_job.original_files.all():
        ProcessorJobOriginalFileAssociation.objects.get_or_create(
            processor_job=new_job, original_file=original_file)

    for dataset in last_job.datasets.all():
        ProcessorJobDatasetAssociation.objects.get_or_create(
            processor_job=new_job, dataset=dataset)

    try:
        logger.debug(
            "Requeuing Processor Job which had ID %d with a new Processor Job with ID %d.",
            last_job.id,
            new_job.id,
        )
        if send_job(ProcessorPipeline[last_job.pipeline_applied],
                    job=new_job,
                    is_dispatch=True):
            last_job.retried = True
            last_job.success = False
            last_job.retried_job = new_job
            last_job.save()
        else:
            # Can't communicate with Batch just now, leave the job for a later loop.
            new_job.delete()
            return False
    except Exception:
        # ``warning`` replaces the deprecated ``warn`` alias; exc_info
        # attaches the traceback of the failure to the log record.
        logger.warning(
            "Failed to requeue Processor Job which had ID %d with a new Processor Job with ID %d.",
            last_job.id,
            new_job.id,
            exc_info=True,
        )
        # Can't communicate with Batch just now, leave the job for a later loop.
        new_job.delete()
        return False

    return True