Example #1
    def handle(self, *args, **options):
        # Create all the dummy data that would have been created
        # before a downloader job could have been generated.
        survey_job = SurveyJob(source_type="ARRAY_EXPRESS")
        survey_job.save()

        batch = Batch(survey_job=survey_job,
                      source_type="ARRAY_EXPRESS",
                      pipeline_required="AFFY_TO_PCL",
                      platform_accession_code="A-AFFY-141",
                      experiment_accession_code="E-GEOD-59071",
                      experiment_title="It doesn't really matter.",
                      organism_id=9606,
                      organism_name="HOMO SAPIENS",
                      release_date="2017-05-05",
                      last_uploaded_date="2017-05-05",
                      status=BatchStatuses.NEW.value)
        batch.save()

        file = File(
            batch=batch,
            size_in_bytes=0,
            download_url=
            "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip",  # noqa
            raw_format="CEL",
            processed_format="PCL",
            name="GSM1426072_CD_colon_active_2.CEL",
            internal_location="A-AFFY-141/AFFY_TO_PCL")
        file.save()

        downloader_job = DownloaderJob.create_job_and_relationships(
            batches=[batch])
        send_job(Downloaders["ARRAY_EXPRESS"], downloader_job.id)
Example #2
def create_quantpendia(organisms, organisms_exclude):
    all_organisms = Organism.objects.all()
    if organisms:
        organisms = organisms.upper().replace(" ", "_").split(",")
        all_organisms = all_organisms.filter(name__in=organisms)

    if organisms_exclude:
        organisms = organisms_exclude.upper().replace(" ", "_").split(",")
        all_organisms = all_organisms.exclude(name__in=organisms)

    logger.debug("Generating quantpendia for organisms", organisms=all_organisms)

    created_jobs = []
    for organism in all_organisms:
        # only generate the quantpendia for organisms that have some samples
        # with quant.sf files.
        has_quantsf_files = organism.sample_set.filter(
            technology="RNA-SEQ", results__computedfile__filename="quant.sf"
        ).exists()
        if not has_quantsf_files:
            continue

        job = create_job_for_organism(organism)
        logger.info(
            "Sending compendia job for Organism", job_id=str(job.pk), organism=str(organism)
        )
        send_job(ProcessorPipeline.CREATE_QUANTPENDIA, job)

        created_jobs.append(job)

    return created_jobs
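A minimal sketch (not part of the source above) of how create_quantpendia might be wired into a Django management command, mirroring the handle() pattern in Examples #6 and #23; the Command class, option names, and output message here are illustrative assumptions.

from django.core.management.base import BaseCommand


class Command(BaseCommand):
    """Hypothetical wrapper around the create_quantpendia() helper above."""

    def add_arguments(self, parser):
        # Comma-separated organism names; create_quantpendia() upper-cases
        # them and replaces spaces with underscores before filtering.
        parser.add_argument("--organisms", type=str, default=None)
        parser.add_argument("--organisms-exclude", type=str, default=None)

    def handle(self, *args, **options):
        created_jobs = create_quantpendia(
            options["organisms"], options["organisms_exclude"]
        )
        self.stdout.write("Created %d quantpendia job(s)." % len(created_jobs))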
Example #3
def create_long_and_short_processor_jobs(files_to_process):
    """ Creates two processor jobs for the files needed for this transcriptome"""

    processor_job_long = ProcessorJob()
    processor_job_long.pipeline_applied = "TRANSCRIPTOME_INDEX_LONG"
    processor_job_long.ram_amount = 8192
    processor_job_long.save()

    for original_file in files_to_process:

        assoc = ProcessorJobOriginalFileAssociation()
        assoc.original_file = original_file
        assoc.processor_job = processor_job_long
        assoc.save()

    send_job(ProcessorPipeline[processor_job_long.pipeline_applied],
             processor_job_long)

    processor_job_short = ProcessorJob()
    processor_job_short.pipeline_applied = "TRANSCRIPTOME_INDEX_SHORT"
    processor_job_short.ram_amount = 8192
    processor_job_short.save()

    for original_file in files_to_process:

        assoc = ProcessorJobOriginalFileAssociation()
        assoc.original_file = original_file
        assoc.processor_job = processor_job_short
        assoc.save()

    send_job(ProcessorPipeline[processor_job_short.pipeline_applied],
             processor_job_short)
Example #4
def create_compendia(svd_algorithm, organisms):
    """Create a compendium for one or more organisms."""

    svd_algorithm_choices = ["ARPACK", "RANDOMIZED", "NONE"]
    if svd_algorithm and svd_algorithm not in svd_algorithm_choices:
        raise Exception(
            "Invalid svd_algorithm option provided. Possible values are " +
            str(svd_algorithm_choices))

    svd_algorithm = svd_algorithm or "ARPACK"

    target_organisms = get_target_organisms(organisms)
    grouped_organisms = group_organisms_by_biggest_platform(target_organisms)

    logger.debug("Generating compendia for organisms",
                 organism_groups=str(grouped_organisms))

    created_jobs = []
    for organism in grouped_organisms:
        job = create_job_for_organism(organism, svd_algorithm)
        logger.info("Sending compendia job for Organism",
                    job_id=str(job.pk),
                    organism=str(organism))
        send_job(ProcessorPipeline.CREATE_COMPENDIA, job)

        created_jobs.append(job)

    return created_jobs
Example #5
def create_processor_job_for_original_files(original_files: List[OriginalFile],
                                            volume_index: int):
    """
    Create a processor job and queue a processor task for a sample related to an experiment.
    """
    # If there are no original files then we've created all the jobs we need to!
    if len(original_files) == 0:
        return
    # For anything that has raw data there should only be one Sample per OriginalFile
    sample_object = original_files[0].samples.first()
    pipeline_to_apply = determine_processor_pipeline(sample_object,
                                                     original_files[0])
    if pipeline_to_apply == ProcessorPipeline.NONE:
        logger.info("No valid processor pipeline found to apply to sample.",
                    sample=sample_object.id,
                    original_file=original_files[0].id)
        for original_file in original_files:
            original_file.delete_local_file()
            original_file.is_downloaded = False
            original_file.save()
    else:
        processor_job = ProcessorJob()
        processor_job.pipeline_applied = pipeline_to_apply.value
        processor_job.ram_amount = determine_ram_amount(
            sample_object, processor_job)
        processor_job.volume_index = volume_index
        processor_job.save()
        for original_file in original_files:
            assoc = ProcessorJobOriginalFileAssociation()
            assoc.original_file = original_file
            assoc.processor_job = processor_job
            assoc.save()
        logger.debug("Queuing processor job.", processor_job=processor_job.id)
        send_job(pipeline_to_apply, processor_job)
Example #6
    def handle(self, *args, **options):
        """Create a quantpendia for one or more organisms."""
        all_organisms = Organism.objects.all()
        if options["organisms"] is not None:
            organisms = options["organisms"].upper().replace(" ",
                                                             "_").split(",")
            all_organisms = all_organisms.filter(name__in=organisms)

        if options["organisms_exclude"]:
            organisms = options["organisms_exclude"].upper().replace(
                " ", "_").split(",")
            all_organisms = all_organisms.exclude(name__in=organisms)

        logger.debug("Generating quantpendia for organisms",
                     organisms=all_organisms)

        for organism in all_organisms:
            # only generate the quantpendia for organisms that have some samples
            # with quant.sf files.
            has_quantsf_files = organism.sample_set.filter(
                technology="RNA-SEQ",
                results__computedfile__filename="quant.sf").exists()
            if not has_quantsf_files:
                continue

            job = create_job_for_organism(organism)
            logger.info("Sending compendia job for Organism",
                        job_id=str(job.pk),
                        organism=str(organism))
            send_job(ProcessorPipeline.CREATE_QUANTPENDIA, job)

        sys.exit(0)
Example #7
def run_tximport():
    """Creates a tximport job for all eligible experiments."""
    eligible_experiments = (Experiment.objects.annotate(
        num_organisms=Count("organisms")).filter(
            num_organisms=1, technology="RNA-SEQ",
            num_processed_samples=0).prefetch_related("samples__results"))

    paginator = Paginator(eligible_experiments, PAGE_SIZE)
    page = paginator.page()

    # Next is to figure out how many samples were processed for
    # each experiment. We should be able to reuse the salmon code
    # because it already does this.
    tximport_pipeline = ProcessorPipeline.TXIMPORT

    while True:
        creation_count = 0

        for experiment in page.object_list:
            quant_results = get_quant_results_for_experiment(experiment)

            if should_run_tximport(experiment, quant_results, True):
                processor_job = ProcessorJob()
                processor_job.pipeline_applied = tximport_pipeline.value
                processor_job.ram_amount = 8192
                # This job doesn't need to run on a specific volume
                # but it uses the same Nomad job as Salmon jobs which
                # do require the volume index.
                processor_job.volume_index = random.choice(
                    list(get_active_volumes()))
                processor_job.save()

                assoc = ProcessorJobOriginalFileAssociation()
                # Any original file linked to any sample of the
                # experiment will work. Tximport is somewhat special
                # in that it doesn't actually use original files so
                # this is just used to point to the experiment.
                assoc.original_file = experiment.samples.all(
                )[0].original_files.all()[0]
                assoc.processor_job = processor_job
                assoc.save()

                creation_count += 1

                try:
                    send_job(tximport_pipeline, processor_job)
                except Exception:
                    # If we cannot queue the job now the Foreman will do
                    # it later.
                    pass

        logger.info(
            "Created %d tximport jobs for experiments past the thresholds.",
            creation_count)

        if not page.has_next():
            break
        else:
            page = paginator.page(page.next_page_number())
Example #8
    def queue_downloader_job_for_original_files(
        self,
        original_files: List[OriginalFile],
        experiment_accession_code: str = None,
        is_transcriptome: bool = False,
    ):
        """Creates a single DownloaderJob with multiple files to download.
        """
        # Transcriptome is a special case because there's no sample_object.
        # It's alright to re-process transcriptome indices.
        if is_transcriptome:
            downloader_task = job_lookup.Downloaders.TRANSCRIPTOME_INDEX
        else:
            source_urls = [
                original_file.source_url for original_file in original_files
            ]
            # There is already a downloader job associated with this file.
            old_assocs_count = DownloaderJobOriginalFileAssociation.objects.filter(
                original_file__source_url__in=source_urls).count()
            if old_assocs_count > 0:
                logger.debug(
                    "We found an existing DownloaderJob for these urls.",
                    source_urls=source_urls)
                return False

            sample_object = original_files[0].samples.first()
            downloader_task = job_lookup.determine_downloader_task(
                sample_object)

        if downloader_task == job_lookup.Downloaders.NONE:
            logger.info(
                "No valid downloader task found for sample.",
                sample=sample_object.id,
                original_file=original_files[0].id,
            )
        else:
            downloader_job = DownloaderJob()
            downloader_job.downloader_task = downloader_task.value
            downloader_job.accession_code = experiment_accession_code
            downloader_job.save()

            downloaded_urls = []
            for original_file in original_files:
                DownloaderJobOriginalFileAssociation.objects.get_or_create(
                    downloader_job=downloader_job, original_file=original_file)

                downloaded_urls.append(original_file.source_url)

            try:
                logger.info(
                    "Queuing downloader job.",
                    survey_job=self.survey_job.id,
                    downloader_job=downloader_job.id,
                    downloaded_urls=downloaded_urls,
                )
                message_queue.send_job(downloader_task, downloader_job)
            except:
                # If we fail to queue the job, it will be requeued.
                pass
Example #9
    def queue_downloader_job_for_original_files(
            self,
            original_files: List[OriginalFile],
            experiment_accession_code: str = None,
            is_transcriptome: bool = False):
        """Creates a single DownloaderJob with multiple files to download.
        """
        source_urls = [
            original_file.source_url for original_file in original_files
        ]
        # There is already a downloader job associated with this file.
        old_assocs = DownloaderJobOriginalFileAssociation.objects.filter(
            original_file__source_url__in=source_urls)
        if len(old_assocs) > 0:
            logger.debug("We found an existing DownloaderJob for these urls.",
                         source_urls=source_urls)
            return False

        # Transcriptome is a special case because there's no sample_object.
        if is_transcriptome:
            downloader_task = job_lookup.Downloaders.TRANSCRIPTOME_INDEX
        else:
            sample_object = original_files[0].samples.first()
            downloader_task = job_lookup.determine_downloader_task(
                sample_object)

        if downloader_task == job_lookup.Downloaders.NONE:
            logger.info("No valid downloader task found for sample.",
                        sample=sample_object.id,
                        original_file=original_files[0].id)
        else:
            downloader_job = DownloaderJob()
            downloader_job.downloader_task = downloader_task.value
            downloader_job.accession_code = experiment_accession_code
            downloader_job.save()

            downloaded_urls = []
            for original_file in original_files:
                DownloaderJobOriginalFileAssociation.objects.get_or_create(
                    downloader_job=downloader_job, original_file=original_file)

                downloaded_urls.append(original_file.source_url)

            try:
                logger.info("Queuing downloader job.",
                            survey_job=self.survey_job.id,
                            downloader_job=downloader_job.id,
                            downloaded_urls=downloaded_urls)
                message_queue.send_job(downloader_task, downloader_job)
            except Exception as e:
                # If the task doesn't get sent we don't want the
                # downloader_job to be left floating
                logger.exception("Failed to enqueue downloader job.",
                                 survey_job=self.survey_job.id,
                                 downloader_job=downloader_job.id,
                                 error=str(e))
                downloader_job.success = False
                downloader_job.failure_reason = str(e)
                downloader_job.save()
Example #10
    def run_transcriptome_processor(self):
        # Create all the dummy data that would have been created
        # before a processor job could have been generated.
        survey_job = SurveyJob(source_type="TRANSCRIPTOME_INDEX")
        survey_job.save()

        batch = Batch(
            survey_job=survey_job,
            source_type="TRANSCRIPTOME_INDEX",
            pipeline_required="TRANSCRIPTOME_INDEX",
            platform_accession_code="EnsemblPlants",
            experiment_accession_code="aegilops_tauschii",
            experiment_title="It doesn't really matter.",
            organism_id=37682,
            organism_name="AEGILOPS TAUSCHII",
            release_date="2017-11-02",
            last_uploaded_date="2017-11-02",
            status=BatchStatuses.DOWNLOADED.value,
        )
        batch.save()

        kmer_size_property = BatchKeyValue(batch=batch,
                                           key="kmer_size",
                                           value="31")
        kmer_size_property.save()

        gtf_file = File(
            name="aegilops_tauschii_short.gtf.gz",
            download_url=(
                "ftp://ftp.ensemblgenomes.org/pub/release-37/plants/gtf"
                "/aegilops_tauschii/Aegilops_tauschii.ASM34733v1.37.gtf.gz"),
            raw_format="gtf.gz",
            processed_format="tar.gz",
            internal_location="EnsemblPlants/TRANSCRIPTOME_INDEX",
            size_in_bytes=-1,
            batch=batch)
        gtf_file.save()

        fasta_file = File(
            name="aegilops_tauschii_short.fa.gz",
            download_url=(
                "ftp://ftp.ensemblgenomes.org/pub/release-37/plants/fasta"
                "/aegilops_tauschii/dna/Aegilops_tauschii."
                "ASM34733v1.dna.toplevel.fa.gz"),
            raw_format="fa.gz",
            processed_format="tar.gz",
            internal_location="EnsemblPlants/TRANSCRIPTOME_INDEX",
            size_in_bytes=-1,
            batch=batch)
        fasta_file.save()

        processor_job = ProcessorJob.create_job_and_relationships(
            batches=[batch])
        logger.info("Queuing a processor job.")
        send_job(ProcessorPipeline[batch.pipeline_required], processor_job.id)
Example #11
def send_janitor_jobs():
    """Dispatch a Janitor job for each job queue.

    TODO: make this dispatch janitor jobs for all job queues.
    https://github.com/AlexsLemonade/refinebio/issues/2789
    """
    new_job = ProcessorJob(num_retries=0, pipeline_applied="JANITOR", ram_amount=2048)
    new_job.save()
    logger.info("Sending Janitor Job.", job_id=new_job.id)
    try:
        send_job(ProcessorPipeline["JANITOR"], job=new_job, is_dispatch=True)
    except Exception:
        # If we can't dispatch this job, something else has gone wrong, we can get it next loop.
        return
Example #12
def retry_unqueued_survey_jobs() -> None:
    """Retry survey jobs that never made it into the Batch job queue."""
    potentially_lost_jobs = SurveyJob.unqueued_objects.filter(
        created_at__gt=utils.JOB_CREATED_AT_CUTOFF
    ).order_by("created_at")
    paginator = Paginator(potentially_lost_jobs, utils.PAGE_SIZE, "created_at")
    database_page = paginator.page()
    database_page_count = 0

    if len(database_page.object_list) <= 0:
        # No failed jobs, nothing to do!
        return

    queue_capacity = get_capacity_for_jobs()

    if queue_capacity <= 0:
        logger.info("Not handling unqueued survey jobs because there is no capacity for them.")

    while queue_capacity > 0:
        for survey_job in database_page.object_list:
            if send_job(SurveyJobTypes.SURVEYOR, job=survey_job, is_dispatch=True):
                queue_capacity -= 1
            else:
                # Can't communicate with Batch just now, leave the job for a later loop.
                break

        if database_page.has_next():
            database_page = paginator.page(database_page.next_page_number())
            database_page_count += 1
            queue_capacity = get_capacity_for_jobs()
        else:
            break
Example #13
def requeue_processor_job(last_job: ProcessorJob) -> None:
    """Queues a new processor job.

    The new processor job will have num_retries one greater than
    last_job.num_retries.
    """
    num_retries = last_job.num_retries + 1

    # The Salmon pipeline is quite RAM-sensitive.
    # Try it again with an increased RAM amount, if possible.
    new_ram_amount = last_job.ram_amount

    # These initial values are set in common/job_lookup.py:determine_ram_amount
    if last_job.pipeline_applied == "SALMON":
        if new_ram_amount == 12288:
            new_ram_amount = 16384
        elif new_ram_amount == 16384:
            new_ram_amount = 32768
    # The AFFY pipeline is somewhat RAM-sensitive.
    # Try it again with an increased RAM amount, if possible.
    elif last_job.pipeline_applied == "AFFY_TO_PCL":
        if new_ram_amount == 2048:
            new_ram_amount = 4096
        elif new_ram_amount == 4096:
            new_ram_amount = 8192

    new_job = ProcessorJob(num_retries=num_retries,
                           pipeline_applied=last_job.pipeline_applied,
                           ram_amount=new_ram_amount,
                           volume_index=last_job.volume_index)
    new_job.save()

    for original_file in last_job.original_files.all():
        ProcessorJobOriginalFileAssociation.objects.get_or_create(
            processor_job=new_job, original_file=original_file)

    for dataset in last_job.datasets.all():
        ProcessorJobDatasetAssociation.objects.get_or_create(
            processor_job=new_job, dataset=dataset)

    try:
        logger.debug(
            "Requeuing Processor Job which had ID %d with a new Processor Job with ID %d.",
            last_job.id, new_job.id)
        if send_job(ProcessorPipeline[last_job.pipeline_applied],
                    job=new_job,
                    is_dispatch=True):
            last_job.retried = True
            last_job.success = False
            last_job.retried_job = new_job
            last_job.save()
        else:
            # Can't communicate with nomad just now, leave the job for a later loop.
            new_job.delete()
    except:
        logger.error(
            "Failed to requeue Processor Job which had ID %d with a new Processor Job with ID %d.",
            last_job.id, new_job.id)
        # Can't communicate with nomad just now, leave the job for a later loop.
        new_job.delete()
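A hedged sketch of how a foreman-style loop might use requeue_processor_job; the success and retried fields come from the code above, but the queryset filter itself is an assumption, not taken from the refinebio source.

# Illustrative only: retry processor jobs that failed and were never retried.
failed_jobs = ProcessorJob.objects.filter(success=False, retried=False)

for last_job in failed_jobs:
    # requeue_processor_job() bumps num_retries, possibly increases the RAM
    # amount for SALMON/AFFY_TO_PCL jobs, and marks last_job as retried only
    # if the new job is dispatched successfully.
    requeue_processor_job(last_job)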
Example #14
def create_processor_jobs_for_original_files(
        original_files: List[OriginalFile],
        downloader_job: DownloaderJob = None):
    """
    Create processor jobs and queue a processor task for samples related to an experiment.
    """
    for original_file in original_files:
        sample_object = original_file.samples.first()

        if not delete_if_blacklisted(original_file):
            continue

        pipeline_to_apply = determine_processor_pipeline(
            sample_object, original_file)

        if pipeline_to_apply == ProcessorPipeline.NONE:
            logger.info(
                "No valid processor pipeline found to apply to sample.",
                sample=sample_object.id,
                original_file=original_file.id)
            original_file.delete_local_file()
            original_file.is_downloaded = False
            original_file.save()
        else:
            processor_job = ProcessorJob()
            processor_job.pipeline_applied = pipeline_to_apply.value
            processor_job.ram_amount = determine_ram_amount(
                sample_object, processor_job)
            processor_job.save()

            assoc = ProcessorJobOriginalFileAssociation()
            assoc.original_file = original_file
            assoc.processor_job = processor_job
            assoc.save()

            if downloader_job:
                logger.debug("Queuing processor job.",
                             processor_job=processor_job.id,
                             original_file=original_file.id,
                             downloader_job=downloader_job.id)
            else:
                logger.debug("Queuing processor job.",
                             processor_job=processor_job.id,
                             original_file=original_file.id)

            send_job(pipeline_to_apply, processor_job)
Example #15
def create_long_and_short_processor_jobs(downloader_job, long_files_to_process,
                                         short_files_to_process):
    """ Creates two processor jobs for the files needed for this transcriptome"""

    processor_job_long = ProcessorJob()
    processor_job_long.downloader_job = downloader_job
    processor_job_long.pipeline_applied = "TRANSCRIPTOME_INDEX_LONG"
    processor_job_long.ram_amount = 4096
    processor_job_long.save()

    for original_file in long_files_to_process:

        assoc = ProcessorJobOriginalFileAssociation()
        assoc.original_file = original_file
        assoc.processor_job = processor_job_long
        assoc.save()

    try:
        send_job(ProcessorPipeline[processor_job_long.pipeline_applied],
                 processor_job_long)
    except Exception:
        # This is fine, the foreman will requeue these later.
        logger.exception(
            "Problem with submitting a long transcriptome index job.")

    processor_job_short = ProcessorJob()
    processor_job_short.downloader_job = downloader_job
    processor_job_short.pipeline_applied = "TRANSCRIPTOME_INDEX_SHORT"
    processor_job_short.ram_amount = 4096
    processor_job_short.save()

    for original_file in short_files_to_process:

        assoc = ProcessorJobOriginalFileAssociation()
        assoc.original_file = original_file
        assoc.processor_job = processor_job_short
        assoc.save()

    try:
        send_job(ProcessorPipeline[processor_job_short.pipeline_applied],
                 processor_job_short)
    except Exception:
        # This is fine, the foreman will requeue these later.
        logger.exception(
            "Problem with submitting a short transcriptome index job.")
Example #16
    def run_sra_processor(self):
        # Create all the dummy data that would have been created
        # before a processor job could have been generated.
        survey_job = SurveyJob(source_type="SRA")
        survey_job.save()

        batch = Batch(
            survey_job=survey_job,
            source_type="SRA",
            pipeline_required="SALMON",
            platform_accession_code="IlluminaHiSeq2500",
            experiment_accession_code="PRJEB5018",
            experiment_title="It doesn't really matter.",
            organism_id=10090,
            organism_name="MUS MUSCULUS",
            release_date="2014-03-25",
            last_uploaded_date="2016-05-20",
            status=BatchStatuses.NEW.value,
        )
        batch.save()

        File(name="ERR1680082_1.fastq",
             download_url=("ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR168/002/"
                           "ERR1680082/ERR1680082_1.fastq.gz"),
             raw_format="fastq",
             processed_format="sf",
             internal_location="IlluminaHiSeq2500/SALMON",
             size_in_bytes=2214725074,
             batch=batch).save()

        File(name="ERR1680082_2.fastq",
             download_url=("ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR168/002/"
                           "ERR1680082/ERR1680082_2.fastq.gz"),
             raw_format="fastq",
             processed_format="sf",
             internal_location="IlluminaHiSeq2500/SALMON",
             size_in_bytes=2214725074,
             batch=batch).save()

        processor_job = ProcessorJob.create_job_and_relationships(
            batches=[batch])
        logger.info("Queuing a processor job.")
        send_job(ProcessorPipeline[batch.pipeline_required], processor_job.id)
Example #17
def requeue_survey_job(last_job: SurveyJob) -> None:
    """Queues a new survey job.

    The new survey job will have num_retries one greater than
    last_job.num_retries.
    """

    num_retries = last_job.num_retries + 1

    new_job = SurveyJob(num_retries=num_retries,
                        source_type=last_job.source_type)

    if new_job.num_retries == 1:
        new_job.ram_amount = 4096
    elif new_job.num_retries in [2, 3]:
        new_job.ram_amount = 16384
    else:
        new_job.ram_amount = 1024

    new_job.save()

    keyvalues = SurveyJobKeyValue.objects.filter(survey_job=last_job)

    for keyvalue in keyvalues:
        SurveyJobKeyValue.objects.get_or_create(
            survey_job=new_job,
            key=keyvalue.key,
            value=keyvalue.value,
        )

    logger.debug(
        "Requeuing SurveyJob which had ID %d with a new SurveyJob with ID %d.",
        last_job.id,
        new_job.id,
    )

    try:
        if send_job(SurveyJobTypes.SURVEYOR, job=new_job, is_dispatch=True):
            last_job.retried = True
            last_job.success = False
            last_job.retried_job = new_job
            last_job.save()
        else:
            # Can't communicate with Batch just now, leave the job for a later loop.
            new_job.delete()
    except Exception:
        logger.error(
            "Failed to requeue Survey Job which had ID %d with a new Survey Job with ID %d.",
            last_job.id,
            new_job.id,
        )
        # Can't communicate with AWS just now, leave the job for a later loop.
        new_job.delete()

    return True
Example #18
def queue_task(processor_job, batch):
    if batch.pipeline_required in ProcessorPipeline.__members__:
        send_job(ProcessorPipeline[batch.pipeline_required],
                 processor_job.id)
        logger.info("Queuing processor job.",
                    downloader_job=job.id,
                    processor_job=processor_job.id,
                    batch=batch.id)
        return True
    else:
        failure_template = "Could not find Processor Pipeline {} in the lookup."
        failure_message = failure_template.format(batch.pipeline_required)
        logger.error(failure_message,
                     downloader_job=job.id,
                     batch=batch.id)
        processor_job.failure_reason = failure_message
        processor_job.success = False
        processor_job.retried = True
        processor_job.save()
        return False
Example #19
def requeue_downloader_job(last_job: DownloaderJob) -> None:
    """Queues a new downloader job.

    The new downloader job will have num_retries one greater than
    last_job.num_retries.
    """
    num_retries = last_job.num_retries + 1

    new_job = DownloaderJob.create_job_and_relationships(
        num_retries=num_retries,
        batches=list(last_job.batches.all()),
        downloader_task=last_job.downloader_task)
    logger.info(
        "Requeuing Downloader Job which had ID %d with a new Downloader Job with ID %d.",
        last_job.id, new_job.id)
    send_job(Downloaders[last_job.downloader_task], new_job.id)

    last_job.retried = True
    last_job.success = False
    last_job.retried_job = new_job
    last_job.save()
Example #20
def requeue_processor_job(last_job: ProcessorJob) -> None:
    """Queues a new processor job.

    The new processor job will have num_retries one greater than
    last_job.num_retries.
    """
    num_retries = last_job.num_retries + 1

    new_job = ProcessorJob.create_job_and_relationships(
        num_retries=num_retries,
        batches=list(last_job.batches.all()),
        pipeline_applied=last_job.pipeline_applied)
    logger.info(
        "Requeuing Processor Job which had ID %d with a new Processor Job with ID %d.",
        last_job.id, new_job.id)
    send_job(ProcessorPipeline[last_job.pipeline_applied], new_job.id)

    last_job.retried = True
    last_job.success = False
    last_job.retried_job = new_job
    last_job.save()
Example #21
    def queue_downloader_jobs(self, batches: List[Batch]):
        if len(batches) > 0:
            downloader_task = self.downloader_task()

            with transaction.atomic():
                downloader_job = DownloaderJob.create_job_and_relationships(
                    batches=batches, downloader_task=downloader_task.value)

            logger.info("Queuing downloader job.",
                        survey_job=self.survey_job.id,
                        downloader_job=downloader_job.id)
            try:
                send_job(downloader_task, downloader_job.id)
            except:
                # If the task doesn't get sent we don't want the
                # downloader_job to be left floating
                downloader_job.delete()
                raise
        else:
            logger.info("Survey job found no new Batches.",
                        survey_job=self.survey_job.id)
Example #22
def create_long_and_short_processor_jobs(files_to_process):
    """ Creates two processor jobs for the files needed for this transcriptome"""

    processor_job_long = ProcessorJob()
    processor_job_long.pipeline_applied = "TRANSCRIPTOME_INDEX_LONG"
    processor_job_long.ram_amount = 4096
    processor_job_long.save()

    for original_file in files_to_process:

        assoc = ProcessorJobOriginalFileAssociation()
        assoc.original_file = original_file
        assoc.processor_job = processor_job_long
        assoc.save()

    try:
        send_job(ProcessorPipeline[processor_job_long.pipeline_applied],
                 processor_job_long)
    except Exception:
        # This is fine, the foreman will requeue these later.
        pass

    processor_job_short = ProcessorJob()
    processor_job_short.pipeline_applied = "TRANSCRIPTOME_INDEX_SHORT"
    processor_job_short.ram_amount = 4096
    processor_job_short.save()

    for original_file in files_to_process:

        assoc = ProcessorJobOriginalFileAssociation()
        assoc.original_file = original_file
        assoc.processor_job = processor_job_short
        assoc.save()

    try:
        send_job(ProcessorPipeline[processor_job_short.pipeline_applied],
                 processor_job_short)
    except Exception:
        # This is fine, the foreman will requeue these later.
        pass
Example #23
    def handle(self, *args, **options):
        """Create a compendium for one or more organisms."""
        svd_algorithm = options["svd_algorithm"] or "ARPACK"

        svd_algorithm_choices = ["ARPACK", "RANDOMIZED", "NONE"]
        if options["svd_algorithm"] and options["svd_algorithm"] not in svd_algorithm_choices:
            raise Exception(
                "Invalid svd_algorithm option provided. Possible values are "
                + str(svd_algorithm_choices)
            )

        target_organisms = self._get_target_organisms(options)
        grouped_organisms = group_organisms_by_biggest_platform(target_organisms)

        logger.debug("Generating compendia for organisms", organism_groups=str(grouped_organisms))

        for organism in grouped_organisms:
            job = create_job_for_organism(organism, svd_algorithm)
            logger.info(
                "Sending compendia job for Organism", job_id=str(job.pk), organism=str(organism)
            )
            send_job(ProcessorPipeline.CREATE_COMPENDIA, job)
Example #24
def run_tximport_if_eligible(experiment: Experiment,
                             dispatch_jobs=True) -> bool:
    """Checks if an experiment is eligible to have tximport run on it and creates a job for it.

    If the dispatch_jobs parameter is True a Batch job will be dispatched for it.

    Returns the ProcessorJob if a job was created or None if one was not.
    """
    tximport_pipeline = ProcessorPipeline.TXIMPORT

    if get_tximport_inputs_if_eligible(experiment, True):
        processor_job = ProcessorJob()
        processor_job.pipeline_applied = tximport_pipeline.value
        processor_job.ram_amount = 32768
        processor_job.save()

        assoc = ProcessorJobOriginalFileAssociation()
        # Any original file linked to any sample of the
        # experiment will work. Tximport is somewhat special
        # in that it doesn't actually use original files so
        # this is just used to point to the experiment.
        assoc.original_file = experiment.samples.all()[0].original_files.all(
        )[0]
        assoc.processor_job = processor_job
        assoc.save()

        if dispatch_jobs:
            try:
                send_job(tximport_pipeline, processor_job)
            except Exception:
                # If we cannot queue the job now the Foreman will do
                # it later.
                pass

        return processor_job

    return None
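For usage, a small sketch of dispatching tximport across many experiments with run_tximport_if_eligible; the candidate queryset is borrowed from the run_tximport() example (#7) and assumes the same Count import from django.db.models.

# Illustrative only: run_tximport_if_eligible() performs its own eligibility
# check, so this outer filter merely narrows the candidate set.
candidate_experiments = Experiment.objects.annotate(
    num_organisms=Count("organisms")
).filter(num_organisms=1, technology="RNA-SEQ", num_processed_samples=0)

created_jobs = []
for experiment in candidate_experiments:
    job = run_tximport_if_eligible(experiment)
    if job is not None:
        created_jobs.append(job)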
Example #25
def send_janitor_jobs():
    """Dispatch a Janitor job for each instance in the cluster"""
    try:
        active_volumes = get_active_volumes()
    except:
        # If we cannot reach Nomad now then we can wait until a later loop.
        return

    for volume_index in active_volumes:
        new_job = ProcessorJob(num_retries=0,
                               pipeline_applied="JANITOR",
                               ram_amount=2048,
                               volume_index=volume_index)
        new_job.save()
        logger.info("Sending Janitor with index: ",
                    job_id=new_job.id,
                    index=volume_index)
        try:
            send_job(ProcessorPipeline["JANITOR"],
                     job=new_job,
                     is_dispatch=True)
        except Exception as e:
            # If we can't dispatch this job, something else has gone wrong.
            continue
Example #26
    def dispatch_job(self, serializer, obj):
        processor_job = ProcessorJob()
        processor_job.pipeline_applied = "SMASHER"
        processor_job.ram_amount = 4096
        processor_job.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = processor_job
        pjda.dataset = obj
        pjda.save()

        job_sent = False

        try:
            # Hidden method of non-dispatching for testing purposes.
            if not self.request.data.get("no_send_job", False):
                job_sent = send_job(ProcessorPipeline.SMASHER, processor_job)
            else:
                # We didn't actually send it, but we also didn't want to.
                job_sent = True
        except Exception as e:
            # Just log whatever exception happens, because the foreman will requeue the job anyway
            logger.error(e)

        if not job_sent:
            raise APIException(
                "Unable to queue download job. Something has gone"
                " wrong and we have been notified about it."
            )

        serializer.validated_data["is_processing"] = True
        obj = serializer.save()

        # create a new dataset annotation with the information of this request
        annotation = DatasetAnnotation()
        annotation.dataset = obj
        annotation.data = {
            "start": True,
            "ip": get_client_ip(self.request),
            "user_agent": self.request.META.get("HTTP_USER_AGENT", None),
        }
        annotation.save()
Example #27
def requeue_downloader_job(last_job: DownloaderJob) -> None:
    """Queues a new downloader job.

    The new downloader job will have num_retries one greater than
    last_job.num_retries.
    """
    num_retries = last_job.num_retries + 1

    new_job = DownloaderJob(num_retries=num_retries,
                            downloader_task=last_job.downloader_task,
                            accession_code=last_job.accession_code)
    new_job.save()

    for original_file in last_job.original_files.all():
        DownloaderJobOriginalFileAssociation.objects.get_or_create(
            downloader_job=new_job, original_file=original_file)

    logger.debug(
        "Requeuing Downloader Job which had ID %d with a new Downloader Job with ID %d.",
        last_job.id, new_job.id)
    try:
        if send_job(Downloaders[last_job.downloader_task],
                    job=new_job,
                    is_dispatch=True):
            last_job.retried = True
            last_job.success = False
            last_job.retried_job = new_job
            last_job.save()
        else:
            # Can't communicate with nomad just now, leave the job for a later loop.
            new_job.delete()
    except:
        logger.error(
            "Failed to requeue Downloader Job which had ID %d with a new Downloader Job with ID %d.",
            last_job.id, new_job.id)
        # Can't communicate with nomad just now, leave the job for a later loop.
        new_job.delete()
Example #28
    def handle(self, *args, **options):
        """ Dispatch QN_REFERENCE creation jobs for all Organisms with a platform with enough processed samples. """

        organisms = Organism.objects.all()

        for organism in organisms:
            samples = Sample.processed_objects.filter(
                organism=organism,
                has_raw=True,
                technology="MICROARRAY",
                is_processed=True,
                platform_name__contains="Affymetrix",
            )
            if samples.count() < MIN:
                logger.info(
                    "Total processed samples don't meet minimum threshold",
                    organism=organism,
                    count=samples.count(),
                    min=MIN,
                )
                continue

            platform_counts = (
                samples.values("platform_accession_code").annotate(
                    dcount=Count("platform_accession_code")).order_by(
                        "-dcount"))
            biggest_platform = platform_counts[0]["platform_accession_code"]

            sample_codes_results = Sample.processed_objects.filter(
                platform_accession_code=biggest_platform,
                has_raw=True,
                technology="MICROARRAY",
                organism=organism,
                is_processed=True,
            ).values("accession_code")

            if sample_codes_results.count() < MIN:
                logger.info(
                    "Number of processed samples for largest platform didn't meet threshold.",
                    organism=organism,
                    platform_accession_code=biggest_platform,
                    count=sample_codes_results.count(),
                    min=MIN,
                )
                continue

            sample_codes = [
                res["accession_code"] for res in sample_codes_results
            ]

            dataset = Dataset()
            dataset.data = {
                organism.name + "_(" + biggest_platform + ")": sample_codes
            }
            dataset.aggregate_by = "ALL"
            dataset.scale_by = "NONE"
            dataset.quantile_normalize = False
            dataset.save()

            job = ProcessorJob()
            job.pipeline_applied = "QN_REFERENCE"
            job.save()

            pjda = ProcessorJobDatasetAssociation()
            pjda.processor_job = job
            pjda.dataset = dataset
            pjda.save()

            logger.info("Sending QN_REFERENCE for Organism",
                        job_id=str(job.pk),
                        organism=str(organism))
            send_job(ProcessorPipeline.QN_REFERENCE, job)
Example #29
    def test_transcriptome_redownloading(self, mock_surveyor):
        """Survey, download, then process a transcriptome index. """

        mock_surveyor.side_effect = build_surveyor_init_mock(
            "TRANSCRIPTOME_INDEX")

        # Clear out pre-existing work dirs so there are no conflicts:
        self.env = EnvironmentVarGuard()
        self.env.set("RUNNING_IN_CLOUD", "False")
        with self.env:
            # I'm not sure why, but sometimes there are already downloader jobs
            # in the database from previous tests even though they should be
            # removed, so pause a bit
            time.sleep(10)
            downloader_jobs = DownloaderJob.objects.all()
            for job in downloader_jobs:
                print(job)
                print(job.accession_code)
            self.assertEqual(downloader_jobs.count(), 0)

            for length in ["LONG", "SHORT"]:
                work_dir_glob = (LOCAL_ROOT_DIR + "/Caenorhabditis_elegans/" +
                                 length + "/processor_job_*")
                for work_dir in glob.glob(work_dir_glob):
                    shutil.rmtree(work_dir)

            # Prevent a call being made to NCBI's API to determine
            # organism name/id.
            organism = Organism(name="CAENORHABDITIS_ELEGANS",
                                taxonomy_id=6239,
                                is_scientific_name=True)
            organism.save()

            # Make sure that we can delete the file before the processors begin
            # by preventing the downloaders from sending the processors
            # automatically. We send the jobs manually later
            no_dispatch = EnvironmentVarGuard()
            no_dispatch.set("AUTO_DISPATCH_NOMAD_JOBS", "False")
            with no_dispatch:
                survey_job = surveyor.survey_transcriptome_index(
                    "Caenorhabditis elegans", "Ensembl")

            self.assertTrue(survey_job.success)

            downloader_jobs = DownloaderJob.objects.all()
            self.assertEqual(downloader_jobs.count(), 1)

            logger.info(
                "Survey Job finished, waiting for Downloader Job with Nomad ID %s to complete.",
                downloader_jobs[0].nomad_job_id,
            )

            downloader_job = wait_for_job(downloader_jobs[0], DownloaderJob,
                                          timezone.now())
            self.assertTrue(downloader_job.success)

            og_file_to_delete = OriginalFile.objects.all()[0]
            os.remove(og_file_to_delete.absolute_file_path)

            processor_jobs = ProcessorJob.objects.all()
            for processor_job in processor_jobs:
                # FIXME: we run these in serial because of
                # https://github.com/AlexsLemonade/refinebio/issues/2321
                send_job(
                    ProcessorPipeline[processor_job.pipeline_applied],
                    job=processor_job,
                    is_dispatch=True,
                )
                try:
                    wait_for_job(processor_job, ProcessorJob, timezone.now())
                except Exception:
                    pass

            # The processor job that had a missing file will have
            # recreated its DownloaderJob, which means there should now be two.
            downloader_jobs = DownloaderJob.objects.all().order_by("-id")
            self.assertEqual(downloader_jobs.count(), 2)

            # However DownloaderJobs don't get queued immediately, so
            # we have to run a foreman function to make it happen:
            retry_lost_downloader_jobs()

            # And we can check that the most recently created
            # DownloaderJob was successful as well:
            recreated_job = downloader_jobs[0]
            recreated_job.refresh_from_db()
            logger.info("Waiting on downloader Nomad job %s",
                        recreated_job.nomad_job_id)
            recreated_job = wait_for_job(recreated_job, DownloaderJob,
                                         timezone.now())
            self.assertTrue(recreated_job.success)

            # Once the Downloader job succeeds, it should create two
            # processor jobs, one for long and one for short indices:
            processor_jobs = ProcessorJob.objects.all()
            self.assertEqual(processor_jobs.count(), 4)

            # Wait for the processor jobs to be dispatched
            time.sleep(15)

            # And finally we can make sure that both of the
            # processor jobs were successful, including the one that
            # got recreated.
            logger.info(
                "Downloader Jobs finished, waiting for processor Jobs to complete."
            )
            successful_processor_jobs = []
            for processor_job in processor_jobs:
                processor_job.refresh_from_db()
                # One of the calls to wait_for_job will fail if the
                # job aborts before we have selected all the
                # processor jobs.
                processor_job = wait_for_job(processor_job, ProcessorJob,
                                             timezone.now())
                if processor_job.success:
                    successful_processor_jobs.append(processor_job)

            # While one of the original ProcessorJobs will be aborted,
            # it is hard to be sure what will happen to the other
            # because of the race between processor jobs starting and
            # us deleting the files they need.
            # Therefore, we're just going to verify that one processor
            # job completed successfully for each length, since that
            # is the main thing we need.
            has_long = False
            has_short = False
            for processor_job in successful_processor_jobs:
                if processor_job.pipeline_applied == "TRANSCRIPTOME_INDEX_LONG":
                    has_long = True
                elif processor_job.pipeline_applied == "TRANSCRIPTOME_INDEX_SHORT":
                    has_short = True

            self.assertTrue(has_long)
            self.assertTrue(has_short)
Example #30
    def queue_downloader_jobs(self, experiment: Experiment,
                              samples: List[Sample]):
        """This enqueues DownloaderJobs on a per-file basis.

        There is a complementary function below for enqueueing multi-file
        DownloaderJobs.
        """
        files_to_download = []
        for sample in samples:
            files_for_sample = OriginalFile.objects.filter(sample=sample,
                                                           is_downloaded=False)
            for og_file in files_for_sample:
                files_to_download.append(og_file)

        download_urls_with_jobs = {}
        for original_file in files_to_download:

            # We don't need to create multiple downloaders for the same file.
            # However, we do want to associate original_files with the
            # DownloaderJobs that will download them.
            if original_file.source_url in download_urls_with_jobs.keys():
                DownloaderJobOriginalFileAssociation.objects.get_or_create(
                    downloader_job=download_urls_with_jobs[
                        original_file.source_url],
                    original_file=original_file,
                )
                continue

            # There is already a downloader job associated with this file.
            old_assocs_count = DownloaderJobOriginalFileAssociation.objects.filter(
                original_file__source_url=original_file.source_url).count()
            if old_assocs_count > 0:
                logger.debug(
                    "We found an existing DownloaderJob for this file/url.",
                    original_file_id=original_file.id,
                )
                continue

            sample_object = original_file.samples.first()
            downloader_task = determine_downloader_task(sample_object)

            if downloader_task == Downloaders.NONE:
                logger.info(
                    "No valid downloader task found for sample.",
                    sample=sample_object.id,
                    original_file=original_file.id,
                )
            else:
                downloader_job = DownloaderJob()
                downloader_job.downloader_task = downloader_task.value
                downloader_job.accession_code = experiment.accession_code
                downloader_job.save()

                DownloaderJobOriginalFileAssociation.objects.get_or_create(
                    downloader_job=downloader_job, original_file=original_file)

                download_urls_with_jobs[
                    original_file.source_url] = downloader_job

                try:
                    logger.info(
                        "Queuing downloader job for URL: " +
                        original_file.source_url,
                        survey_job=self.survey_job.id,
                        downloader_job=downloader_job.id,
                    )
                    send_job(downloader_task, downloader_job)
                except Exception:
                    # If we fail to queue the job, it will be requeued.
                    pass