Example #1
def requeue_sample(sample, dry_run=False):
    sample_was_requeued = False
    if sample.is_processed:
        has_live_computational_results = False
        for result in sample.results.all():
            live_files = result.computedfile_set.filter(
                s3_bucket__isnull=False, s3_key__isnull=False)
            if live_files.exists():
                has_live_computational_results = True
                break  # One live result is enough; no need to scan the rest.

        live_computed_files = sample.computed_files.filter(
            s3_bucket__isnull=False, s3_key__isnull=False)

        if not (has_live_computational_results
                or live_computed_files.exists()):
            sample_was_requeued = True
            if not dry_run:
                # There are no live computed files, so the sample
                # should not have been marked processed.
                sample.is_processed = False
                sample.save()

                create_downloader_job(sample.original_files.all(), force=True)

    return sample_was_requeued
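
A minimal usage sketch for requeue_sample, assuming a Django shell where Sample is importable; the driver below is illustrative and not part of the listing:

# Dry run first: count what would be requeued without touching anything.
candidates = Sample.objects.filter(is_processed=True)
would_requeue = sum(requeue_sample(s, dry_run=True) for s in candidates)
print("%d of %d processed samples would be requeued" % (would_requeue, candidates.count()))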
Example #2
def update_salmon_versions(experiment: Experiment):
    quant_results = get_quant_results_for_experiment(experiment,
                                                     filter_old_versions=False)

    # We first need to find the last created result and get its salmon version.
    # get_quant_results_for_experiment returns a set, not a queryset.
    last_created = None
    for quant_result in quant_results:
        if last_created is None or quant_result.created_at > last_created.created_at:
            last_created = quant_result

    if last_created is None:
        # No quant results at all, so there is nothing to re-run.
        return

    latest_salmon_version = last_created.organism_index.salmon_version

    total_samples_queued = 0
    for quant_result in quant_results:
        if latest_salmon_version != quant_result.organism_index.salmon_version:
            # This quant result was produced with an older Salmon version, so
            # its samples need to be re-run. Ideally each computational result
            # is associated with a single sample.
            for sample in quant_result.samples.all():
                original_files = list(sample.original_files.all())

                if not original_files:
                    continue

                # Ensure that there are no processor jobs for these original
                # files that the foreman might want to retry (failed | hung | lost).
                has_open_processor_job = (ProcessorJob.objects.filter(
                    original_files=original_files[0],
                    pipeline_applied=ProcessorPipeline.SALMON).filter(
                        # Failed: the job ran, did not succeed, and was never retried.
                        Q(success=False, retried=False, no_retry=False)
                        # Hung: the job started and got a batch job id, but never finished.
                        | Q(
                            success=None,
                            retried=False,
                            no_retry=False,
                            start_time__isnull=False,
                            end_time=None,
                            batch_job_id__isnull=False,
                        )
                        # Lost: the job never even started.
                        | Q(
                            success=None,
                            retried=False,
                            no_retry=False,
                            start_time=None,
                            end_time=None,
                        )).exists())
                if has_open_processor_job:
                    continue

                create_downloader_job(original_files, force=True)
                total_samples_queued += 1

    logger.info(
        "Re-ran Salmon for %d samples in experiment %s.",
        total_samples_queued,
        experiment.accession_code,
    )
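
Because get_quant_results_for_experiment returns a plain set here rather than a queryset, the search for the newest result can be collapsed with max(); an equivalent sketch, assuming every result carries created_at:

# Equivalent to the "find last created" loop above:
last_created = max(quant_results, key=lambda r: r.created_at, default=None)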
Example #3
def update_salmon_versions(experiment: Experiment):
    quant_results = (get_quant_results_for_experiment(
        experiment, filter_old_versions=False).order_by(
            "-organism_index__created_at").prefetch_related(
                "organism_index").prefetch_related("samples__original_files"))

    total_samples_queued = 0
    latest_salmon_version = None
    for quant_result in quant_results:
        if latest_salmon_version is None:
            # The first quant result carries the latest salmon version because
            # we ordered by -organism_index__created_at, so just record it.
            latest_salmon_version = quant_result.organism_index.salmon_version
        elif latest_salmon_version != quant_result.organism_index.salmon_version:
            # This quant result was produced with an older Salmon version, so
            # its samples need to be re-run. Ideally each computational result
            # is associated with a single sample.
            for sample in quant_result.samples.all():
                original_files = list(sample.original_files.all())

                if not original_files:
                    continue

                # Ensure that there are no processor jobs for these original
                # files that the foreman might want to retry (failed | hung | lost).
                has_open_processor_job = (ProcessorJob.objects.filter(
                    original_files=original_files[0],
                    pipeline_applied=ProcessorPipeline.SALMON).filter(
                        # Failed: the job ran, did not succeed, and was never retried.
                        Q(success=False, retried=False, no_retry=False)
                        # Hung: the job started and got a Nomad job id, but never finished.
                        | Q(
                            success=None,
                            retried=False,
                            no_retry=False,
                            start_time__isnull=False,
                            end_time=None,
                            nomad_job_id__isnull=False,
                        )
                        # Lost: the job never even started.
                        | Q(
                            success=None,
                            retried=False,
                            no_retry=False,
                            start_time=None,
                            end_time=None,
                        )).exists())
                if has_open_processor_job:
                    continue

                create_downloader_job(original_files, force=True)
                total_samples_queued += 1

    logger.info(
        "Re-ran Salmon for %d samples in experiment %s.",
        total_samples_queued,
        experiment.accession_code,
    )
Example #4
    def handle(self, *args, **options):
        """Requeues downloader jobs for samples that haven't been processed and
        whose original files have no downloader jobs associated with them.
        """
        supported_microarray_platforms = [
            x["platform_accession"] for x in get_supported_microarray_platforms()
        ]
        supported_rnaseq_platforms = [x.replace(" ", "") for x in get_supported_rnaseq_platforms()]
        all_supported_platforms = (
            supported_microarray_platforms + supported_rnaseq_platforms
        )  # https://www.postgresql.org/docs/9.1/functions-array.html

        # Ensure selected samples have valid platforms
        samples_without_downloader = (
            Sample.objects.all()
            .filter(platform_accession_code__in=all_supported_platforms)
            .annotate(
                original_files_count=Count("original_files"),
                downloader_job_count=Count("original_files__downloader_jobs"),
            )
            .filter(is_processed=False, original_files_count__gt=0, downloader_job_count=0)
        )
        if options.get("created_after", None):
            samples_without_downloader = samples_without_downloader.filter(
                created_at__gt=options["created_after"]
            )

        samples_without_downloader = samples_without_downloader.prefetch_related("original_files")

        logger.info(
            "Found %d samples without downloader jobs, starting to create them now.",
            samples_without_downloader.count(),
        )

        paginator = Paginator(samples_without_downloader, PAGE_SIZE)
        page = paginator.page(1)

        while True:
            for sample in page.object_list:
                logger.debug("Creating downloader job for a sample.", sample=sample.accession_code)
                create_downloader_job(sample.original_files.all())

            logger.info(
                "Created %d new downloader jobs because their samples didn't have any.",
                len(page.object_list),
            )

            if not page.has_next():
                break

            page = paginator.page(page.next_page_number())
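
The handler above reads options["created_after"], which implies the command defines a matching argument. A sketch of what that definition might look like, on the same Command class (hypothetical; the real definition is not part of this listing):

    def add_arguments(self, parser):
        # Hypothetical: this is what the handler's options["created_after"]
        # lookup implies; the actual definition is not shown here.
        parser.add_argument(
            "--created-after",
            dest="created_after",
            help="Only consider samples created after this date.",
        )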
Example #5
def prepare_original_files(job_context):
    """ Provision in the Job context for OriginalFile-driven processors
    """
    job = job_context["job"]
    original_files = job.original_files.all()

    if not original_files.exists():
        raise ProcessorJobError("No files were found for the job.", success=False)

    undownloaded_files = set()
    for original_file in original_files:
        if original_file.needs_downloading(job_context["job_id"]):
            if original_file.is_downloaded:
                # If it needs to be downloaded then it's not
                # downloaded and the is_downloaded field should stop
                # lying about that.
                original_file.is_downloaded = False
                original_file.save()

            undownloaded_files.add(original_file)

    if undownloaded_files:
        logger.info(
            (
                "One or more files found which were missing or not downloaded."
                " Creating downloader jobs for them and deleting this job."
            ),
            processor_job=job.id,
            missing_files=list(undownloaded_files),
        )

        was_job_created = create_downloader_job(
            undownloaded_files, processor_job_id=job_context["job_id"], force=True
        )
        if not was_job_created:
            raise ProcessorJobError(
                "Missing file for processor job but unable to recreate downloader jobs!",
                success=False,
            )

        raise ProcessorJobError(
            "We can not process the data because it is not on the disk",
            success=False,
            no_retry=True,  # this job should not be retried again
            abort=True,  # abort the job and don't do anything else
            undownloaded_files=[file.id for file in undownloaded_files],
        )

    job_context["original_files"] = original_files
    first_original_file = original_files.first()
    samples = Sample.objects.filter(original_files=first_original_file)
    job_context["samples"] = samples
    job_context["computed_files"] = []

    return job_context
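
prepare_original_files relies on ProcessorJobError accepting structured keyword context (success, no_retry, abort, plus arbitrary extras such as undownloaded_files). A minimal sketch of the exception interface those call sites imply; the real class lives elsewhere in the project and may differ:

class ProcessorJobError(Exception):
    # Hypothetical reconstruction based only on the call sites above.
    def __init__(self, message, success=None, no_retry=False, abort=False, **context):
        super().__init__(message)
        self.success = success    # value to record on the failed job
        self.no_retry = no_retry  # the foreman should not retry this job
        self.abort = abort        # stop processing immediately
        self.context = context    # extra fields for structured logging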
Example #6
def requeue_samples(eligible_samples):
    paginator = Paginator(eligible_samples, PAGE_SIZE)
    page = paginator.page(1)

    creation_count = 0
    while True:
        for sample in page.object_list:
            if create_downloader_job(sample.original_files.all(), force=True):
                creation_count += 1

        if not page.has_next():
            break

        page = paginator.page(page.next_page_number())

        logger.info("Creating new downloader jobs. %d so far", creation_count)

        # 2000 samples queued up every five minutes should be fast
        # enough and also not thrash the DB.
        time.sleep(60 * 5)

    return creation_count
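
A hypothetical caller for requeue_samples; the eligibility filter below is illustrative, borrowing the is_processed flag and original_files relation seen in the other examples:

eligible_samples = Sample.objects.filter(is_processed=False).prefetch_related("original_files")
created_count = requeue_samples(eligible_samples)
logger.info("Finished requeueing: %d downloader jobs created.", created_count)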