示例#1
0
    def queue_downloader_job_for_original_files(
        self,
        original_files: List[OriginalFile],
        experiment_accession_code: str = None,
        is_transcriptome: bool = False,
    ):
        """Creates a single DownloaderJob with multiple files to download.

        Returns False without queuing anything when a DownloaderJob is
        already associated with any of the files' source URLs (unless
        the files are transcriptome indices, which may be re-processed).
        Otherwise creates and queues one job covering every file in
        `original_files`.
        """
        # Transcriptome is a special case because there's no sample_object.
        # It's alright to re-process transcriptome indices, so the
        # existing-job check below is skipped for them.
        if is_transcriptome:
            downloader_task = job_lookup.Downloaders.TRANSCRIPTOME_INDEX
        else:
            source_urls = [
                original_file.source_url for original_file in original_files
            ]
            # There is already a downloader job associated with this file.
            # .exists() issues a cheap EXISTS query instead of counting
            # every matching row just to test for presence.
            if DownloaderJobOriginalFileAssociation.objects.filter(
                    original_file__source_url__in=source_urls).exists():
                logger.debug(
                    "We found an existing DownloaderJob for these urls.",
                    source_urls=source_urls)
                return False

            sample_object = original_files[0].samples.first()
            downloader_task = job_lookup.determine_downloader_task(
                sample_object)

        if downloader_task == job_lookup.Downloaders.NONE:
            # Only reachable via the non-transcriptome branch, so
            # sample_object is guaranteed to be bound here.
            logger.info(
                "No valid downloader task found for sample.",
                sample=sample_object.id,
                original_file=original_files[0].id,
            )
        else:
            downloader_job = DownloaderJob()
            downloader_job.downloader_task = downloader_task.value
            downloader_job.accession_code = experiment_accession_code
            downloader_job.save()

            downloaded_urls = []
            for original_file in original_files:
                DownloaderJobOriginalFileAssociation.objects.get_or_create(
                    downloader_job=downloader_job, original_file=original_file)

                downloaded_urls.append(original_file.source_url)

            try:
                logger.info(
                    "Queuing downloader job.",
                    survey_job=self.survey_job.id,
                    downloader_job=downloader_job.id,
                    downloaded_urls=downloaded_urls,
                )
                message_queue.send_job(downloader_task, downloader_job)
            except Exception:
                # Deliberately best-effort: if we fail to queue the job,
                # it will be requeued. A bare `except:` here would also
                # have swallowed KeyboardInterrupt/SystemExit, so we
                # narrow to Exception and at least record the failure.
                logger.exception(
                    "Failed to queue downloader job.",
                    downloader_job=downloader_job.id,
                )
示例#2
0
    def queue_downloader_job_for_original_files(
            self,
            original_files: List[OriginalFile],
            experiment_accession_code: str = None,
            is_transcriptome: bool = False):
        """Creates a single DownloaderJob with multiple files to download.

        Returns False without queuing anything when a DownloaderJob is
        already associated with any of the files' source URLs. On a
        failure to enqueue, the job is marked failed rather than left
        floating.
        """
        source_urls = [
            original_file.source_url for original_file in original_files
        ]
        # There is already a downloader job associated with this file.
        # .exists() issues a cheap EXISTS query instead of fetching and
        # materializing the whole queryset just to take its len().
        if DownloaderJobOriginalFileAssociation.objects.filter(
                original_file__source_url__in=source_urls).exists():
            logger.debug("We found an existing DownloaderJob for these urls.",
                         source_urls=source_urls)
            return False

        # Transcriptome is a special case because there's no sample_object.
        if is_transcriptome:
            downloader_task = job_lookup.Downloaders.TRANSCRIPTOME_INDEX
        else:
            sample_object = original_files[0].samples.first()
            downloader_task = job_lookup.determine_downloader_task(
                sample_object)

        if downloader_task == job_lookup.Downloaders.NONE:
            # Only reachable via the non-transcriptome branch, so
            # sample_object is guaranteed to be bound here.
            logger.info("No valid downloader task found for sample.",
                        sample=sample_object.id,
                        original_file=original_files[0].id)
        else:
            downloader_job = DownloaderJob()
            downloader_job.downloader_task = downloader_task.value
            downloader_job.accession_code = experiment_accession_code
            downloader_job.save()

            downloaded_urls = []
            for original_file in original_files:
                DownloaderJobOriginalFileAssociation.objects.get_or_create(
                    downloader_job=downloader_job, original_file=original_file)

                downloaded_urls.append(original_file.source_url)

            try:
                logger.info("Queuing downloader job.",
                            survey_job=self.survey_job.id,
                            downloader_job=downloader_job.id,
                            downloaded_urls=downloaded_urls)
                message_queue.send_job(downloader_task, downloader_job)
            except Exception as e:
                # If the task doesn't get sent we don't want the
                # downloader_job to be left floating
                logger.exception("Failed to enqueue downloader job.",
                                 survey_job=self.survey_job.id,
                                 downloader_job=downloader_job.id,
                                 error=str(e))
                downloader_job.success = False
                downloader_job.failure_reason = str(e)
                downloader_job.save()
    def queue_downloader_jobs(self, experiment: Experiment,
                              samples: List[Sample]):
        """This enqueues DownloaderJobs on a per-file basis.

        There is a complementary function below for enqueueing multi-file
        DownloaderJobs.
        """
        files_to_download = []
        for sample in samples:
            files_for_sample = OriginalFile.objects.filter(sample=sample,
                                                           is_downloaded=False)
            for og_file in files_for_sample:
                files_to_download.append(og_file)

        # Maps source_url -> the DownloaderJob created for it, so files
        # sharing a URL are associated with the same job.
        download_urls_with_jobs = {}
        for original_file in files_to_download:

            # We don't need to create multiple downloaders for the same file.
            # However, we do want to associate original_files with the
            # DownloaderJobs that will download them.
            # (`in` on the dict directly; no need for .keys().)
            if original_file.source_url in download_urls_with_jobs:
                DownloaderJobOriginalFileAssociation.objects.get_or_create(
                    downloader_job=download_urls_with_jobs[
                        original_file.source_url],
                    original_file=original_file,
                )
                continue

            # There is already a downloader job associated with this file.
            # .exists() issues a cheap EXISTS query instead of counting
            # every matching row just to test for presence.
            if DownloaderJobOriginalFileAssociation.objects.filter(
                    original_file__source_url=original_file.source_url
            ).exists():
                logger.debug(
                    "We found an existing DownloaderJob for this file/url.",
                    original_file_id=original_file.id,
                )
                continue

            sample_object = original_file.samples.first()
            downloader_task = determine_downloader_task(sample_object)

            if downloader_task == Downloaders.NONE:
                logger.info(
                    "No valid downloader task found for sample.",
                    sample=sample_object.id,
                    original_file=original_file.id,
                )
            else:
                downloader_job = DownloaderJob()
                downloader_job.downloader_task = downloader_task.value
                downloader_job.accession_code = experiment.accession_code
                downloader_job.save()

                DownloaderJobOriginalFileAssociation.objects.get_or_create(
                    downloader_job=downloader_job, original_file=original_file)

                download_urls_with_jobs[
                    original_file.source_url] = downloader_job

                try:
                    logger.info(
                        "Queuing downloader job for URL: " +
                        original_file.source_url,
                        survey_job=self.survey_job.id,
                        downloader_job=downloader_job.id,
                    )
                    send_job(downloader_task, downloader_job)
                except Exception:
                    # Deliberately best-effort: if we fail to queue the
                    # job, it will be requeued.
                    pass
示例#4
0
def create_downloader_job(undownloaded_files: OriginalFile, processor_job_id: int) -> bool:
    """Creates a downloader job to download `undownloaded_files`.

    NOTE(review): `undownloaded_files` is iterated, so the annotation is
    presumably meant to be an iterable of OriginalFile (e.g.
    List[OriginalFile]) — left unchanged here to avoid depending on a
    typing import that may not exist in this module; confirm and fix at
    the signature.

    Tries, in order, to base the new job on (1) the latest DownloaderJob
    associated with any of the files, (2) the DownloaderJob of the
    archive the files were extracted from, or (3) a freshly-determined
    downloader task for the files' sample. Returns True when a new job
    was created, False otherwise.
    """
    if not undownloaded_files:
        return False

    original_downloader_job = None
    archive_file = None
    for undownloaded_file in undownloaded_files:
        try:
            original_downloader_job = undownloaded_file.downloader_jobs.latest('id')

            # Found the job so we don't need to keep going.
            break
        except DownloaderJob.DoesNotExist:
            # If there's no association between this file and any
            # downloader jobs, it's most likely because the original
            # file was created after extracting a archive containing
            # multiple files worth of data.
            # The way to handle this is to find that archive and
            # recreate a downloader job FOR THAT. That archive will
            # have the same filename as the file at the end of the
            # 'source_url' field, because that source URL is pointing
            # to the archive we need.
            archive_filename = undownloaded_file.source_url.split("/")[-1]

            # This file or its job might not exist, but we'll wait
            # until we've checked all the files before calling it a
            # failure.
            try:
                archive_file = OriginalFile.objects.filter(filename=archive_filename)
                if archive_file.count() > 0:
                    archive_file = archive_file.first()
                else:
                    # We might need to match these up based on
                    # source_filenames rather than filenames so just
                    # try them both.
                    archive_file = OriginalFile.objects.filter(source_filename=archive_filename).first()

                original_downloader_job = DownloaderJobOriginalFileAssociation.objects.filter(
                    original_file=archive_file
                ).latest('id').downloader_job
                # Found the job so we don't need to keep going.
                break
            except Exception:
                # Deliberate best-effort per the comment above: any
                # failure here (no archive file, no association, etc.)
                # just means we move on to the next file. A bare
                # `except:` would also have swallowed interrupts.
                pass

    if not original_downloader_job:
        # No existing job to copy from — derive the task from the
        # sample of the first undownloaded file instead.
        sample_object = list(undownloaded_files)[0].samples.first()
        if sample_object:
            downloader_task = job_lookup.determine_downloader_task(sample_object)

            if downloader_task == job_lookup.Downloaders.NONE:
                logger.warn(("No valid downloader task found for sample, which is weird"
                             " because it was able to have a processor job created for it..."),
                            sample=sample_object.id)
                return False
            else:
                # determine_downloader_task returns an enum object,
                # but we wanna set this on the DownloaderJob object so
                # we want the actual value.
                downloader_task = downloader_task.value

            accession_code = sample_object.accession_code
            original_files = sample_object.original_files.all()
        else:
            logger.error(
                "Could not find the original DownloaderJob or Sample for these files.",
                undownloaded_file=undownloaded_files
            )
            return False
    elif original_downloader_job.was_recreated:
        # Guard against an endless recreate loop.
        logger.warn(
            "Downloader job has already been recreated once, not doing it again.",
            original_downloader_job=original_downloader_job,
            undownloaded_files=undownloaded_files
        )
        return False
    else:
        downloader_task = original_downloader_job.downloader_task
        accession_code = original_downloader_job.accession_code
        original_files = original_downloader_job.original_files.all()

        sample_object = original_files[0].samples.first()

    new_job = DownloaderJob()
    new_job.downloader_task = downloader_task
    new_job.accession_code = accession_code
    new_job.was_recreated = True
    new_job.ram_amount = 1024
    new_job.save()

    if archive_file:
        # If this downloader job is for an archive file, then the
        # files that were passed into this function aren't what need
        # to be directly downloaded, they were extracted out of this
        # archive. The DownloaderJob will re-extract them and set up
        # the associations for the new ProcessorJob.
        # So double check that it still needs downloading because
        # another file that came out of it could have already
        # recreated the DownloaderJob.
        if archive_file.needs_downloading(processor_job_id):
            if archive_file.is_downloaded:
                # If it needs to be downloaded then it's not
                # downloaded and the is_downloaded field should stop
                # lying about that.
                archive_file.is_downloaded = False
                archive_file.save()

            DownloaderJobOriginalFileAssociation.objects.get_or_create(
                downloader_job=new_job,
                original_file=archive_file
            )
    else:
        # We can't just associate the undownloaded files, because
        # there's a chance that there is a file which actually is
        # downloaded that also needs to be associated with the job.
        for original_file in original_files:
            DownloaderJobOriginalFileAssociation.objects.get_or_create(
                downloader_job=new_job,
                original_file=original_file
            )

    return True