def requeue_sample(sample, dry_run=False):
    sample_was_requeued = False

    if sample.is_processed:
        has_live_computational_results = False
        for result in sample.results.all():
            live_files = result.computedfile_set.filter(
                s3_bucket__isnull=False, s3_key__isnull=False
            )
            if live_files.count() >= 1:
                has_live_computational_results = True

        live_computed_files = sample.computed_files.filter(
            s3_bucket__isnull=False, s3_key__isnull=False
        )

        if not (has_live_computational_results or live_computed_files.count() > 0):
            sample_was_requeued = True
            if not dry_run:
                # There are no live computed files, so the sample
                # should not have been marked processed.
                sample.is_processed = False
                sample.save()

                create_downloader_job(sample.original_files.all(), force=True)

    return sample_was_requeued
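# A minimal usage sketch for requeue_sample: run it in dry-run mode over all
# processed samples to count how many would be requeued before committing to
# anything. The count_requeue_candidates name and the queryset filter are
# assumptions for illustration, not part of the original module.
def count_requeue_candidates():
    would_requeue = 0
    for sample in Sample.objects.filter(is_processed=True).iterator():
        if requeue_sample(sample, dry_run=True):  # dry_run never mutates state
            would_requeue += 1
    return would_requeue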
def update_salmon_versions(experiment: Experiment):
    quant_results = get_quant_results_for_experiment(experiment, filter_old_versions=False)

    # We first need to find the last created result and get its salmon version.
    # get_quant_results_for_experiment returns a set, not a queryset.
    last_created = None
    for quant_result in quant_results:
        if not last_created:
            last_created = quant_result
        elif quant_result.created_at > last_created.created_at:
            last_created = quant_result

    if not last_created:
        # No quant results exist for this experiment, so there is nothing to requeue.
        return

    latest_salmon_version = last_created.organism_index.salmon_version

    total_samples_queued = 0
    for quant_result in quant_results:
        if latest_salmon_version != quant_result.organism_index.salmon_version:
            # We found a quant result for which we need to re-run Salmon.
            # Hopefully each computational result is associated with a single sample.
            for sample in quant_result.samples.all():
                original_files = list(sample.original_files.all())

                if not len(original_files):
                    continue

                # Ensure that there are no processor jobs for these original files
                # that the foreman might want to retry (failed | hung | lost).
                has_open_processor_job = (
                    ProcessorJob.objects.filter(
                        original_files=original_files[0],
                        pipeline_applied=ProcessorPipeline.SALMON,
                    )
                    .filter(
                        # Failed jobs.
                        Q(success=False, retried=False, no_retry=False)
                        # Hung jobs.
                        | Q(
                            success=None,
                            retried=False,
                            no_retry=False,
                            start_time__isnull=False,
                            end_time=None,
                            batch_job_id__isnull=False,
                        )
                        # Lost jobs.
                        | Q(
                            success=None,
                            retried=False,
                            no_retry=False,
                            start_time=None,
                            end_time=None,
                        )
                    )
                    .exists()
                )
                if has_open_processor_job:
                    continue

                create_downloader_job(original_files, force=True)
                total_samples_queued += 1

    logger.info(
        "Re-ran Salmon for %d samples in experiment %s.",
        total_samples_queued,
        experiment.accession_code,
    )
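# Sketch: since quant_results is a plain set, the "last created" scan above is
# equivalent to a single max() call. latest_quant_result is a hypothetical
# helper name, and it assumes quant_results is non-empty.
def latest_quant_result(quant_results):
    return max(quant_results, key=lambda result: result.created_at)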
def update_salmon_versions(experiment: Experiment):
    quant_results = (
        get_quant_results_for_experiment(experiment, filter_old_versions=False)
        .order_by("-organism_index__created_at")
        .prefetch_related("organism_index")
        .prefetch_related("samples__original_files")
    )

    total_samples_queued = 0
    latest_salmon_version = None
    for quant_result in quant_results:
        if not latest_salmon_version:
            # The first quant result carries the latest salmon version because
            # we ordered by -organism_index__created_at, so it can be skipped.
            latest_salmon_version = quant_result.organism_index.salmon_version
        elif latest_salmon_version != quant_result.organism_index.salmon_version:
            # We found a quant result for which we need to re-run Salmon.
            # Hopefully each computational result is associated with a single sample.
            for sample in quant_result.samples.all():
                original_files = list(sample.original_files.all())

                if not len(original_files):
                    continue

                # Ensure that there are no processor jobs for these original files
                # that the foreman might want to retry (failed | hung | lost).
                has_open_processor_job = (
                    ProcessorJob.objects.filter(
                        original_files=original_files[0],
                        pipeline_applied=ProcessorPipeline.SALMON,
                    )
                    .filter(
                        # Failed jobs.
                        Q(success=False, retried=False, no_retry=False)
                        # Hung jobs.
                        | Q(
                            success=None,
                            retried=False,
                            no_retry=False,
                            start_time__isnull=False,
                            end_time=None,
                            nomad_job_id__isnull=False,
                        )
                        # Lost jobs.
                        | Q(
                            success=None,
                            retried=False,
                            no_retry=False,
                            start_time=None,
                            end_time=None,
                        )
                    )
                    .exists()
                )
                if has_open_processor_job:
                    continue

                create_downloader_job(original_files, force=True)
                total_samples_queued += 1

    logger.info(
        "Re-ran Salmon for %d samples in experiment %s.",
        total_samples_queued,
        experiment.accession_code,
    )
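# The (failed | hung | lost) filter is duplicated in both versions of
# update_salmon_versions above. A sketch of extracting it into a shared helper;
# has_open_salmon_job is a hypothetical name, and the hung-job clause keys off
# nomad_job_id as in the second version (swap in batch_job_id for the first).
def has_open_salmon_job(original_file) -> bool:
    return (
        ProcessorJob.objects.filter(
            original_files=original_file,
            pipeline_applied=ProcessorPipeline.SALMON,
        )
        .filter(
            Q(success=False, retried=False, no_retry=False)  # failed
            | Q(
                success=None,
                retried=False,
                no_retry=False,
                start_time__isnull=False,
                end_time=None,
                nomad_job_id__isnull=False,
            )  # hung
            | Q(
                success=None,
                retried=False,
                no_retry=False,
                start_time=None,
                end_time=None,
            )  # lost
        )
        .exists()
    )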
def handle(self, *args, **options):
    """Requeue downloader jobs for samples that haven't been processed and
    whose original files have no downloader jobs associated with them.
    """
    supported_microarray_platforms = [
        x["platform_accession"] for x in get_supported_microarray_platforms()
    ]
    supported_rnaseq_platforms = [x.replace(" ", "") for x in get_supported_rnaseq_platforms()]
    all_supported_platforms = supported_microarray_platforms + supported_rnaseq_platforms

    # https://www.postgresql.org/docs/9.1/functions-array.html
    # Ensure selected samples have valid platforms.
    samples_without_downloader = (
        Sample.objects.all()
        .filter(platform_accession_code__in=all_supported_platforms)
        .annotate(
            original_files_count=Count("original_files"),
            downloader_job_count=Count("original_files__downloader_jobs"),
        )
        .filter(is_processed=False, original_files_count__gt=0, downloader_job_count=0)
    )

    if options.get("created_after", None):
        samples_without_downloader = samples_without_downloader.filter(
            created_at__gt=options["created_after"]
        )

    samples_without_downloader = samples_without_downloader.prefetch_related("original_files")

    logger.info(
        "Found %d samples without downloader jobs, starting to create them now.",
        samples_without_downloader.count(),
    )

    paginator = Paginator(samples_without_downloader, PAGE_SIZE)
    page = paginator.page(1)

    while True:
        for sample in page.object_list:
            logger.debug("Creating downloader job for a sample.", sample=sample.accession_code)
            create_downloader_job(sample.original_files.all())

        logger.info(
            "Created %d new downloader jobs because their samples didn't have any.", PAGE_SIZE
        )

        if not page.has_next():
            break

        page = paginator.page(page.next_page_number())
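# handle() reads options["created_after"], so the command presumably declares
# that flag on its parser. A sketch of what the declaration could look like;
# the flag spelling and the ISO-date parsing are assumptions:
from datetime import datetime  # assumed import for the sketch below

def add_arguments(self, parser):
    parser.add_argument(
        "--created-after",
        type=datetime.fromisoformat,  # e.g. --created-after 2020-01-31
        help="Only consider samples created after this date.",
    )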
def prepare_original_files(job_context):
    """Provision the job context for OriginalFile-driven processors."""
    job = job_context["job"]
    original_files = job.original_files.all()

    if original_files.count() == 0:
        raise ProcessorJobError("No files were found for the job.", success=False)

    undownloaded_files = set()
    for original_file in original_files:
        if original_file.needs_downloading(job_context["job_id"]):
            if original_file.is_downloaded:
                # If it needs to be downloaded then it's not
                # downloaded and the is_downloaded field should stop
                # lying about that.
                original_file.is_downloaded = False
                original_file.save()

            undownloaded_files.add(original_file)

    if undownloaded_files:
        logger.info(
            (
                "One or more files found which were missing or not downloaded."
                " Creating downloader jobs for them and deleting this job."
            ),
            processor_job=job.id,
            missing_files=list(undownloaded_files),
        )

        was_job_created = create_downloader_job(
            undownloaded_files, processor_job_id=job_context["job_id"], force=True
        )
        if not was_job_created:
            raise ProcessorJobError(
                "Missing file for processor job but unable to recreate downloader jobs!",
                success=False,
            )

        raise ProcessorJobError(
            "We cannot process the data because it is not on disk.",
            success=False,
            no_retry=True,  # this job should not be retried again
            abort=True,  # abort the job and don't do anything else
            undownloaded_files=[file.id for file in undownloaded_files],
        )

    job_context["original_files"] = original_files
    first_original_file = original_files.first()
    samples = Sample.objects.filter(original_files=first_original_file)
    job_context["samples"] = samples
    job_context["computed_files"] = []

    return job_context
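# ProcessorJobError is raised above with a message plus keyword context
# (success, no_retry, abort, and arbitrary extras such as undownloaded_files).
# A minimal sketch of an exception class compatible with those call sites; the
# real class in the codebase may store or report this context differently.
class ProcessorJobError(Exception):
    def __init__(self, failure_reason, success=None, no_retry=False, abort=False, **context):
        super().__init__(failure_reason)
        self.failure_reason = failure_reason
        self.success = success
        self.no_retry = no_retry
        self.abort = abort
        self.context = context  # extra debugging info, e.g. undownloaded_files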
def requeue_samples(eligible_samples):
    paginator = Paginator(eligible_samples, PAGE_SIZE)
    page = paginator.page(1)

    creation_count = 0
    while True:
        for sample in page.object_list:
            if create_downloader_job(sample.original_files.all(), force=True):
                creation_count += 1

        if not page.has_next():
            break

        page = paginator.page(page.next_page_number())

        logger.info("Creating new downloader jobs. %d so far", creation_count)

        # 2000 samples queued up every five minutes should be fast
        # enough and also not thrash the DB.
        time.sleep(60 * 5)

    return creation_count
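# A usage sketch for requeue_samples: select unprocessed samples that still
# have original files, mirroring the eligibility filter the management command
# above uses. The queryset is an assumption; the explicit order_by keeps
# Django's Paginator from warning about paginating an unordered queryset.
eligible_samples = (
    Sample.objects.filter(is_processed=False)
    .annotate(original_files_count=Count("original_files"))
    .filter(original_files_count__gt=0)
    .order_by("id")
    .prefetch_related("original_files")
)
total_created = requeue_samples(eligible_samples)
logger.info("Created %d downloader jobs in total.", total_created)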