# Imports these functions rely on. The module paths follow the
# data_refinery_common package layout these snippets appear to come from;
# adjust them if your tree differs.
from typing import List

from data_refinery_common import job_lookup, message_queue
from data_refinery_common.job_lookup import Downloaders, determine_downloader_task
from data_refinery_common.logging import get_and_configure_logger
from data_refinery_common.message_queue import send_job
from data_refinery_common.models import (
    DownloaderJob,
    DownloaderJobOriginalFileAssociation,
    Experiment,
    OriginalFile,
    Sample,
)

logger = get_and_configure_logger(__name__)


def queue_downloader_job_for_original_files(
    self,
    original_files: List[OriginalFile],
    experiment_accession_code: str = None,
    is_transcriptome: bool = False,
):
    """Creates a single DownloaderJob with multiple files to download."""
    # Transcriptome is a special case because there's no sample_object.
    # It's alright to re-process transcriptome indices.
    if is_transcriptome:
        downloader_task = job_lookup.Downloaders.TRANSCRIPTOME_INDEX
    else:
        source_urls = [original_file.source_url for original_file in original_files]

        # There is already a downloader job associated with this file.
        old_assocs_count = DownloaderJobOriginalFileAssociation.objects.filter(
            original_file__source_url__in=source_urls
        ).count()
        if old_assocs_count > 0:
            logger.debug(
                "We found an existing DownloaderJob for these urls.",
                source_urls=source_urls,
            )
            return False

        sample_object = original_files[0].samples.first()
        downloader_task = job_lookup.determine_downloader_task(sample_object)

    if downloader_task == job_lookup.Downloaders.NONE:
        logger.info(
            "No valid downloader task found for sample.",
            sample=sample_object.id,
            original_file=original_files[0].id,
        )
    else:
        downloader_job = DownloaderJob()
        downloader_job.downloader_task = downloader_task.value
        downloader_job.accession_code = experiment_accession_code
        downloader_job.save()

        downloaded_urls = []
        for original_file in original_files:
            DownloaderJobOriginalFileAssociation.objects.get_or_create(
                downloader_job=downloader_job, original_file=original_file
            )
            downloaded_urls.append(original_file.source_url)

        try:
            logger.info(
                "Queuing downloader job.",
                survey_job=self.survey_job.id,
                downloader_job=downloader_job.id,
                downloaded_urls=downloaded_urls,
            )
            message_queue.send_job(downloader_task, downloader_job)
        except Exception:
            # If we fail to queue the job, it will be requeued.
            pass
# A variant of the method above: here the existing-job check runs before the
# transcriptome special case, and a failed enqueue marks the job as failed
# instead of relying on it being requeued.
def queue_downloader_job_for_original_files(
    self,
    original_files: List[OriginalFile],
    experiment_accession_code: str = None,
    is_transcriptome: bool = False,
):
    """Creates a single DownloaderJob with multiple files to download."""
    source_urls = [original_file.source_url for original_file in original_files]

    # There is already a downloader job associated with this file.
    old_assocs = DownloaderJobOriginalFileAssociation.objects.filter(
        original_file__source_url__in=source_urls
    )
    if old_assocs.exists():
        logger.debug(
            "We found an existing DownloaderJob for these urls.",
            source_urls=source_urls,
        )
        return False

    # Transcriptome is a special case because there's no sample_object.
    if is_transcriptome:
        downloader_task = job_lookup.Downloaders.TRANSCRIPTOME_INDEX
    else:
        sample_object = original_files[0].samples.first()
        downloader_task = job_lookup.determine_downloader_task(sample_object)

    if downloader_task == job_lookup.Downloaders.NONE:
        logger.info(
            "No valid downloader task found for sample.",
            sample=sample_object.id,
            original_file=original_files[0].id,
        )
    else:
        downloader_job = DownloaderJob()
        downloader_job.downloader_task = downloader_task.value
        downloader_job.accession_code = experiment_accession_code
        downloader_job.save()

        downloaded_urls = []
        for original_file in original_files:
            DownloaderJobOriginalFileAssociation.objects.get_or_create(
                downloader_job=downloader_job, original_file=original_file
            )
            downloaded_urls.append(original_file.source_url)

        try:
            logger.info(
                "Queuing downloader job.",
                survey_job=self.survey_job.id,
                downloader_job=downloader_job.id,
                downloaded_urls=downloaded_urls,
            )
            message_queue.send_job(downloader_task, downloader_job)
        except Exception as e:
            # If the task doesn't get sent we don't want the
            # downloader_job to be left floating.
            logger.exception(
                "Failed to enqueue downloader job.",
                survey_job=self.survey_job.id,
                downloader_job=downloader_job.id,
                error=str(e),
            )
            downloader_job.success = False
            downloader_job.failure_reason = str(e)
            downloader_job.save()
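# Usage sketch (hypothetical, not part of this module): how a surveyor that
# has just created an Experiment might hand all of its not-yet-downloaded
# files to queue_downloader_job_for_original_files above as one multi-file
# DownloaderJob. The helper name and the `surveyor` parameter are
# illustrative, and experiment.samples.all() assumes the usual reverse
# relation; only the method itself and the OriginalFile fields used
# elsewhere in this file are taken from it.
def queue_experiment_files(surveyor, experiment: Experiment) -> None:
    undownloaded_files = [
        original_file
        for sample in experiment.samples.all()
        for original_file in OriginalFile.objects.filter(sample=sample, is_downloaded=False)
    ]
    if undownloaded_files:
        surveyor.queue_downloader_job_for_original_files(
            undownloaded_files,
            experiment_accession_code=experiment.accession_code,
        )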
def queue_downloader_jobs(self, experiment: Experiment, samples: List[Sample]):
    """This enqueues DownloaderJobs on a per-file basis.

    There is a complementary function above for enqueueing multi-file
    DownloaderJobs.
    """
    files_to_download = []
    for sample in samples:
        files_for_sample = OriginalFile.objects.filter(sample=sample, is_downloaded=False)
        for og_file in files_for_sample:
            files_to_download.append(og_file)

    download_urls_with_jobs = {}
    for original_file in files_to_download:
        # We don't need to create multiple downloaders for the same file.
        # However, we do want to associate original_files with the
        # DownloaderJobs that will download them.
        if original_file.source_url in download_urls_with_jobs:
            DownloaderJobOriginalFileAssociation.objects.get_or_create(
                downloader_job=download_urls_with_jobs[original_file.source_url],
                original_file=original_file,
            )
            continue

        # There is already a downloader job associated with this file.
        old_assocs_count = DownloaderJobOriginalFileAssociation.objects.filter(
            original_file__source_url=original_file.source_url
        ).count()
        if old_assocs_count > 0:
            logger.debug(
                "We found an existing DownloaderJob for this file/url.",
                original_file_id=original_file.id,
            )
            continue

        sample_object = original_file.samples.first()
        downloader_task = determine_downloader_task(sample_object)
        if downloader_task == Downloaders.NONE:
            logger.info(
                "No valid downloader task found for sample.",
                sample=sample_object.id,
                original_file=original_file.id,
            )
        else:
            downloader_job = DownloaderJob()
            downloader_job.downloader_task = downloader_task.value
            downloader_job.accession_code = experiment.accession_code
            downloader_job.save()

            DownloaderJobOriginalFileAssociation.objects.get_or_create(
                downloader_job=downloader_job, original_file=original_file
            )
            download_urls_with_jobs[original_file.source_url] = downloader_job

            try:
                logger.info(
                    "Queuing downloader job for URL: " + original_file.source_url,
                    survey_job=self.survey_job.id,
                    downloader_job=downloader_job.id,
                )
                send_job(downloader_task, downloader_job)
            except Exception:
                # If we fail to queue the job, it will be requeued.
                pass
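# Sanity-check sketch (hypothetical): queue_downloader_jobs above is meant to
# leave every distinct source_url of a freshly surveyed batch with exactly one
# DownloaderJob, and every OriginalFile sharing that URL associated to it.
# This illustrative helper (its name and `samples` argument are not from this
# file) verifies that invariant right after a first pass; it is not a general
# invariant, since legitimate retries can add more jobs per URL later.
def one_job_per_url(samples: List[Sample]) -> bool:
    job_ids_by_url = {}
    for sample in samples:
        for original_file in OriginalFile.objects.filter(sample=sample):
            for assoc in DownloaderJobOriginalFileAssociation.objects.filter(
                original_file=original_file
            ):
                job_ids_by_url.setdefault(original_file.source_url, set()).add(
                    assoc.downloader_job_id
                )
    return all(len(job_ids) == 1 for job_ids in job_ids_by_url.values())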
def create_downloader_job(undownloaded_files: List[OriginalFile], processor_job_id: int) -> bool:
    """Creates a downloader job to download `undownloaded_files`."""
    if not undownloaded_files:
        return False

    original_downloader_job = None
    archive_file = None
    for undownloaded_file in undownloaded_files:
        try:
            original_downloader_job = undownloaded_file.downloader_jobs.latest("id")

            # Found the job so we don't need to keep going.
            break
        except DownloaderJob.DoesNotExist:
            # If there's no association between this file and any
            # downloader jobs, it's most likely because the original
            # file was created after extracting an archive containing
            # multiple files worth of data.
            # The way to handle this is to find that archive and
            # recreate a downloader job FOR THAT. That archive will
            # have the same filename as the file at the end of the
            # 'source_url' field, because that source URL is pointing
            # to the archive we need.
            archive_filename = undownloaded_file.source_url.split("/")[-1]

            # This file or its job might not exist, but we'll wait
            # until we've checked all the files before calling it a
            # failure.
            try:
                archive_file = OriginalFile.objects.filter(filename=archive_filename)
                if archive_file.count() > 0:
                    archive_file = archive_file.first()
                else:
                    # We might need to match these up based on
                    # source_filenames rather than filenames so just
                    # try them both.
                    archive_file = OriginalFile.objects.filter(
                        source_filename=archive_filename
                    ).first()

                original_downloader_job = (
                    DownloaderJobOriginalFileAssociation.objects.filter(
                        original_file=archive_file
                    )
                    .latest("id")
                    .downloader_job
                )

                # Found the job so we don't need to keep going.
                break
            except Exception:
                pass

    if not original_downloader_job:
        sample_object = list(undownloaded_files)[0].samples.first()
        if sample_object:
            downloader_task = job_lookup.determine_downloader_task(sample_object)

            if downloader_task == job_lookup.Downloaders.NONE:
                logger.warn(
                    (
                        "No valid downloader task found for sample, which is weird"
                        " because it was able to have a processor job created for it..."
                    ),
                    sample=sample_object.id,
                )
                return False
            else:
                # determine_downloader_task returns an enum object, but
                # we want to set the actual value on the DownloaderJob
                # object.
                downloader_task = downloader_task.value

            accession_code = sample_object.accession_code
            original_files = sample_object.original_files.all()
        else:
            logger.error(
                "Could not find the original DownloaderJob or Sample for these files.",
                undownloaded_file=undownloaded_files,
            )
            return False
    elif original_downloader_job.was_recreated:
        logger.warn(
            "Downloader job has already been recreated once, not doing it again.",
            original_downloader_job=original_downloader_job,
            undownloaded_files=undownloaded_files,
        )
        return False
    else:
        downloader_task = original_downloader_job.downloader_task
        accession_code = original_downloader_job.accession_code
        original_files = original_downloader_job.original_files.all()
        sample_object = original_files[0].samples.first()

    new_job = DownloaderJob()
    new_job.downloader_task = downloader_task
    new_job.accession_code = accession_code
    new_job.was_recreated = True
    new_job.ram_amount = 1024
    new_job.save()

    if archive_file:
        # If this downloader job is for an archive file, then the
        # files that were passed into this function aren't what need
        # to be directly downloaded, they were extracted out of this
        # archive. The DownloaderJob will re-extract them and set up
        # the associations for the new ProcessorJob.
        # So double check that it still needs downloading because
        # another file that came out of it could have already
        # recreated the DownloaderJob.
        if archive_file.needs_downloading(processor_job_id):
            if archive_file.is_downloaded:
                # If it needs to be downloaded then it's not
                # downloaded and the is_downloaded field should stop
                # lying about that.
                archive_file.is_downloaded = False
                archive_file.save()

            DownloaderJobOriginalFileAssociation.objects.get_or_create(
                downloader_job=new_job, original_file=archive_file
            )
    else:
        # We can't just associate the undownloaded files, because
        # there's a chance that there is a file which actually is
        # downloaded that also needs to be associated with the job.
        for original_file in original_files:
            DownloaderJobOriginalFileAssociation.objects.get_or_create(
                downloader_job=new_job, original_file=original_file
            )

    return True
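# Usage sketch (hypothetical): a worker that discovers its input files are
# missing from disk can use create_downloader_job above to requeue the
# download. The wrapper name is illustrative, and
# `processor_job.original_files` assumes the reverse relation implied by
# needs_downloading(processor_job_id); only create_downloader_job itself
# comes from this file.
def requeue_missing_downloads(processor_job) -> bool:
    undownloaded_files = [
        original_file
        for original_file in processor_job.original_files.all()
        if original_file.needs_downloading(processor_job.id)
    ]
    return create_downloader_job(undownloaded_files, processor_job.id)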