def test_download_file_ncbi(self, mock_send_job):
    mock_send_job.return_value = None

    dlj = DownloaderJob()
    dlj.accession_code = "DRR002116"
    dlj.save()

    og = OriginalFile()
    og.source_filename = "DRR002116.sra"
    og.source_url = "[email protected]:/sra/sra-instant/reads/ByRun/sra/DRR/DRR002/DRR002116/DRR002116.sra"
    og.is_archive = True
    og.save()

    sample = Sample()
    sample.accession_code = 'DRR002116'
    sample.save()

    assoc = OriginalFileSampleAssociation()
    assoc.sample = sample
    assoc.original_file = og
    assoc.save()

    assoc = DownloaderJobOriginalFileAssociation()
    assoc.downloader_job = dlj
    assoc.original_file = og
    assoc.save()

    result, downloaded_files = sra.download_sra(dlj.pk)
    utils.end_downloader_job(dlj, result)
    self.assertTrue(result)
    self.assertEqual(downloaded_files[0].sha1, 'd5374e7fe047d4f76b165c3f5148ab2df9d42cea')
    self.assertTrue(os.path.exists(downloaded_files[0].absolute_file_path))

def create_downloader_job():
    job = DownloaderJob(
        downloader_task="SRA",
        batch_job_id="DEFAULT",
        num_retries=0,
        accession_code="NUNYA",
        success=None,
    )
    job.save()

    og_file = OriginalFile()
    og_file.source_filename = "doesn't matter"
    og_file.filename = "this either"
    og_file.absolute_file_path = "nor this"
    og_file.save()

    assoc1 = DownloaderJobOriginalFileAssociation()
    assoc1.original_file = og_file
    assoc1.downloader_job = job
    assoc1.save()

    og_file = OriginalFile()
    og_file.source_filename = "doesn't matter"
    og_file.filename = "this either"
    og_file.absolute_file_path = "nor this"
    og_file.save()

    assoc = DownloaderJobOriginalFileAssociation()
    assoc.original_file = og_file
    assoc.downloader_job = job
    assoc.save()

    return job

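# A minimal usage sketch for the create_downloader_job() fixture above, assuming
# it sits next to a Django TestCase in the same test module. The snippets in this
# file omit their imports, so the import below is an assumption; the
# original_files relation is the one used by requeue_downloader_job() later on.
from django.test import TestCase


class CreateDownloaderJobFixtureTest(TestCase):
    def test_fixture_wires_two_original_files(self):
        job = create_downloader_job()

        # The fixture saves the job with its default metadata...
        self.assertEqual(job.downloader_task, "SRA")
        self.assertEqual(job.accession_code, "NUNYA")

        # ...and attaches both OriginalFiles to it through
        # DownloaderJobOriginalFileAssociation.
        self.assertEqual(job.original_files.count(), 2)
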
def test_download_and_extract_file(self, mock_urlopen):
    mock_urlopen.side_effect = file_caching_urlopen

    dlj = DownloaderJob()
    dlj.save()
    array_express._download_file(
        "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip",
        "dlme.zip",
        dlj,
    )
    files = array_express._extract_files("dlme.zip", "123", dlj)

    # Test that all files were correctly extracted
    filenames = [file["filename"] for file in files]
    EXPECTED_FILES = [
        "GSM1426089_controle_colon_86.CEL",
        "GSM1426088_controle_colon_85.CEL",
        "GSM1426087_controle_colon_84.CEL",
        "GSM1426086_controle_colon_83.CEL",
        "GSM1426085_controle_colon_82.CEL",
        "GSM1426084_controle_colon_81.CEL",
        "GSM1426083_controle_colon_80.CEL",
        "GSM1426082_controle_colon_79.CEL",
        "GSM1426081_controle_colon_78.CEL",
        "GSM1426080_controle_colon_77.CEL",
        "GSM1426079_controle_colon_76.CEL",
        "GSM1426078_CD_colon_active_8.CEL",
        "GSM1426077_CD_colon_active_7.CEL",
        "GSM1426076_CD_colon_active_6.CEL",
        "GSM1426075_CD_colon_active_5.CEL",
        "GSM1426074_CD_colon_active_4.CEL",
        "GSM1426073_CD_colon_active_3.CEL",
        "GSM1426072_CD_colon_active_2.CEL",
        "GSM1426071_CD_colon_active_1.CEL",
    ]
    self.assertEqual(sorted(filenames), sorted(EXPECTED_FILES))

def create_downloader_job(self):
    job = DownloaderJob(
        downloader_task="SRA",
        nomad_job_id="DOWNLOADER/dispatch-1528945054-e8eaf540",
        num_retries=0,
        accession_code="NUNYA",
        success=None)
    job.save()

    og_file = OriginalFile()
    og_file.source_filename = "doesn't matter"
    og_file.filename = "this either"
    og_file.absolute_file_path = "nor this"
    og_file.save()

    assoc1 = DownloaderJobOriginalFileAssociation()
    assoc1.original_file = og_file
    assoc1.downloader_job = job
    assoc1.save()

    og_file = OriginalFile()
    og_file.source_filename = "doesn't matter"
    og_file.filename = "this either"
    og_file.absolute_file_path = "nor this"
    og_file.save()

    assoc = DownloaderJobOriginalFileAssociation()
    assoc.original_file = og_file
    assoc.downloader_job = job
    assoc.save()

    return job

def test_download_file_swapper(self, mock_send_job):
    mock_send_job.return_value = None

    dlj = DownloaderJob()
    dlj.accession_code = "DRR002116"
    dlj.save()

    og = OriginalFile()
    og.source_filename = "DRR002116.sra"
    og.source_url = "[email protected]:/sra/sra-instant/reads/ByRun/sra/DRR/DRR002/DRR002116/DRR002116.sra"
    og.is_archive = True
    og.save()

    sample = Sample()
    sample.accession_code = 'DRR002116'
    sample.save()

    assoc = OriginalFileSampleAssociation()
    assoc.sample = sample
    assoc.original_file = og
    assoc.save()

    assoc = DownloaderJobOriginalFileAssociation()
    assoc.downloader_job = dlj
    assoc.original_file = og
    assoc.save()

    result = sra._download_file(og.source_url, dlj, "/tmp", force_ftp=False)
    self.assertTrue(result)

def test_download_and_extract_file(self, mock_send_job):
    dlj = DownloaderJob()
    dlj.save()
    array_express._download_file(
        'ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip',
        'dlme.zip',
        dlj)
    files = array_express._extract_files('dlme.zip', '123', dlj)

def test_download_file_ncbi(self):
    dlj = DownloaderJob()
    dlj.accession_code = "SRR9117853"
    dlj.save()

    og = OriginalFile()
    og.source_filename = "SRR9117853.sra"
    og.source_url = "[email protected]:/sra/sra-instant/reads/ByRun/sra/SRR/SRR9117/SRR9117853/SRR9117853.sra"
    og.is_archive = True
    og.save()

    sample = Sample()
    sample.accession_code = "SRR9117853"
    sample.save()

    assoc = OriginalFileSampleAssociation()
    assoc.sample = sample
    assoc.original_file = og
    assoc.save()

    assoc = DownloaderJobOriginalFileAssociation()
    assoc.downloader_job = dlj
    assoc.original_file = og
    assoc.save()

    result, downloaded_files = sra.download_sra(dlj.pk)
    utils.end_downloader_job(dlj, result)
    self.assertTrue(result)
    self.assertEqual(downloaded_files[0].sha1, "e7ad484fe6f134ba7d1b2664e58cc15ae5a958cc")
    self.assertTrue(os.path.exists(downloaded_files[0].absolute_file_path))

def test_aspera_downloader(self):
    """Tests that an SRA file can be downloaded after its FTP URL is
    converted to use Aspera.
    """
    batch = Batch(
        survey_job=self.survey_job,
        source_type="SRA",
        pipeline_required="SALMON",
        platform_accession_code="IlluminaHiSeq2000",
        experiment_accession_code="DRX001563",
        experiment_title="It doesn't really matter.",
        organism_id=9031,
        organism_name="GALLUS GALLUS",
        release_date="2013-07-19",
        last_uploaded_date="2017-09-11",
        status=BatchStatuses.NEW.value)
    batch.save()

    # This is converted from FTP URL to use Aspera
    file = File(
        size_in_bytes=0,
        download_url="ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036000/ERR036000_1.fastq.gz",  # noqa
        raw_format="fastq.gz",
        processed_format="tar.gz",
        name="ERR036000_1.fastq.gz",
        internal_location="IlluminaHiSeq2000/SALMON",
        batch=batch)

    dj = DownloaderJob()
    self.assertTrue(sra._download_file(file, dj, file.name))

def test_download_file(self):
    dlj = DownloaderJob()
    dlj.accession_code = "ERR036"
    dlj.save()

    og = OriginalFile()
    og.source_filename = "ERR036000.fastq.gz"
    og.source_url = "ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036000/ERR036000_1.fastq.gz"
    og.is_archive = True
    og.save()

    sample = Sample()
    sample.accession_code = "ERR036000"
    sample.save()

    assoc = OriginalFileSampleAssociation()
    assoc.sample = sample
    assoc.original_file = og
    assoc.save()

    assoc = DownloaderJobOriginalFileAssociation()
    assoc.downloader_job = dlj
    assoc.original_file = og
    assoc.save()

    result, downloaded_files = sra.download_sra(dlj.pk)
    self.assertTrue(result)
    self.assertEqual(downloaded_files[0].sha1, "1dfe5460a4101fe87feeffec0cb2e053f6695961")
    self.assertTrue(os.path.exists(downloaded_files[0].absolute_file_path))

def test_download_file(self, mock_send_job):
    mock_send_job.return_value = None

    dlj = DownloaderJob()
    dlj.accession_code = "ERR036"
    dlj.save()

    og = OriginalFile()
    og.source_filename = "ERR036000.fastq.gz"
    og.source_url = "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036000/ERR036000_1.fastq.gz"
    og.is_archive = True
    og.save()

    sample = Sample()
    sample.accession_code = 'ERR036000'
    sample.save()

    assoc = OriginalFileSampleAssociation()
    assoc.sample = sample
    assoc.original_file = og
    assoc.save()

    assoc = DownloaderJobOriginalFileAssociation()
    assoc.downloader_job = dlj
    assoc.original_file = og
    assoc.save()

    success = sra.download_sra(dlj.pk)

def queue_downloader_job_for_original_files(
    self,
    original_files: List[OriginalFile],
    experiment_accession_code: str = None,
    is_transcriptome: bool = False,
):
    """Creates a single DownloaderJob with multiple files to download."""
    # Transcriptome is a special case because there's no sample_object.
    # It's alright to re-process transcriptome indices.
    if is_transcriptome:
        downloader_task = job_lookup.Downloaders.TRANSCRIPTOME_INDEX
    else:
        source_urls = [original_file.source_url for original_file in original_files]
        # There is already a downloader job associated with this file.
        old_assocs_count = DownloaderJobOriginalFileAssociation.objects.filter(
            original_file__source_url__in=source_urls).count()
        if old_assocs_count > 0:
            logger.debug(
                "We found an existing DownloaderJob for these urls.",
                source_urls=source_urls)
            return False

        sample_object = original_files[0].samples.first()
        downloader_task = job_lookup.determine_downloader_task(sample_object)

    if downloader_task == job_lookup.Downloaders.NONE:
        logger.info(
            "No valid downloader task found for sample.",
            sample=sample_object.id,
            original_file=original_files[0].id,
        )
    else:
        downloader_job = DownloaderJob()
        downloader_job.downloader_task = downloader_task.value
        downloader_job.accession_code = experiment_accession_code
        downloader_job.save()

        downloaded_urls = []
        for original_file in original_files:
            DownloaderJobOriginalFileAssociation.objects.get_or_create(
                downloader_job=downloader_job, original_file=original_file)
            downloaded_urls.append(original_file.source_url)

        try:
            logger.info(
                "Queuing downloader job.",
                survey_job=self.survey_job.id,
                downloader_job=downloader_job.id,
                downloaded_urls=downloaded_urls,
            )
            message_queue.send_job(downloader_task, downloader_job)
        except Exception:
            # If we fail to queue the job, it will be requeued.
            pass

def queue_downloader_job_for_original_files(
        self,
        original_files: List[OriginalFile],
        experiment_accession_code: str = None,
        is_transcriptome: bool = False):
    """Creates a single DownloaderJob with multiple files to download."""
    source_urls = [original_file.source_url for original_file in original_files]

    # There is already a downloader job associated with this file.
    old_assocs = DownloaderJobOriginalFileAssociation.objects.filter(
        original_file__source_url__in=source_urls)
    if len(old_assocs) > 0:
        logger.debug("We found an existing DownloaderJob for these urls.",
                     source_urls=source_urls)
        return False

    # Transcriptome is a special case because there's no sample_object.
    if is_transcriptome:
        downloader_task = job_lookup.Downloaders.TRANSCRIPTOME_INDEX
    else:
        sample_object = original_files[0].samples.first()
        downloader_task = job_lookup.determine_downloader_task(sample_object)

    if downloader_task == job_lookup.Downloaders.NONE:
        logger.info("No valid downloader task found for sample.",
                    sample=sample_object.id,
                    original_file=original_files[0].id)
    else:
        downloader_job = DownloaderJob()
        downloader_job.downloader_task = downloader_task.value
        downloader_job.accession_code = experiment_accession_code
        downloader_job.save()

        downloaded_urls = []
        for original_file in original_files:
            DownloaderJobOriginalFileAssociation.objects.get_or_create(
                downloader_job=downloader_job, original_file=original_file)
            downloaded_urls.append(original_file.source_url)

        try:
            logger.info("Queuing downloader job.",
                        survey_job=self.survey_job.id,
                        downloader_job=downloader_job.id,
                        downloaded_urls=downloaded_urls)
            message_queue.send_job(downloader_task, downloader_job)
        except Exception as e:
            # If the task doesn't get sent we don't want the
            # downloader_job to be left floating
            logger.exception("Failed to enqueue downloader job.",
                             survey_job=self.survey_job.id,
                             downloader_job=downloader_job.id,
                             error=str(e))
            downloader_job.success = False
            downloader_job.failure_reason = str(e)
            downloader_job.save()

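# A hedged sketch of how a surveyor might call
# queue_downloader_job_for_original_files() above for a freshly surveyed
# experiment. The helper name and the query below are illustrative assumptions;
# the relation names (samples, experiments) only mirror the association models
# used elsewhere in this file.
def queue_one_job_for_experiment(surveyor, experiment):
    # Collect every file for the experiment that still needs downloading.
    original_files = list(
        OriginalFile.objects.filter(
            samples__experiments=experiment, is_downloaded=False
        ).distinct()
    )
    if original_files:
        # A single DownloaderJob covers all of the experiment's files.
        surveyor.queue_downloader_job_for_original_files(
            original_files,
            experiment_accession_code=experiment.accession_code,
        )
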
def create_processor_job(pipeline="AFFY_TO_PCL", ram_amount=2048, start_time=None):
    og_file_1 = OriginalFile()
    og_file_1.source_filename = "doesn't matter"
    og_file_1.filename = "this either"
    og_file_1.absolute_file_path = "nor this"
    og_file_1.save()

    og_file_2 = OriginalFile()
    og_file_2.source_filename = "doesn't matter"
    og_file_2.filename = "this either"
    og_file_2.absolute_file_path = "nor this"
    og_file_2.save()

    downloader_job = None
    if pipeline == "AFFY_TO_PCL":
        downloader_job = DownloaderJob(
            downloader_task="SRA",
            batch_job_id="DEFAULT",
            num_retries=0,
            accession_code="NUNYA",
            success=None,
        )
        downloader_job.save()

        assoc = DownloaderJobOriginalFileAssociation()
        assoc.original_file = og_file_2
        assoc.downloader_job = downloader_job
        assoc.save()

        assoc1 = DownloaderJobOriginalFileAssociation()
        assoc1.original_file = og_file_1
        assoc1.downloader_job = downloader_job
        assoc1.save()

    processor_job = ProcessorJob(
        downloader_job=downloader_job,
        pipeline_applied=pipeline,
        batch_job_id="PROCESSOR/dispatch-1528945054-e8eaf540",
        ram_amount=ram_amount,
        num_retries=0,
        success=None,
        start_time=start_time,
    )
    processor_job.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = og_file_1
    assoc1.processor_job = processor_job
    assoc1.save()

    assoc = ProcessorJobOriginalFileAssociation()
    assoc.original_file = og_file_2
    assoc.processor_job = processor_job
    assoc.save()

    return processor_job

def test_jobs_sanity(self):
    """Just makes sure creating Jobs doesn't fail"""
    s_job = SurveyJob()
    s_job.save()

    processor_job = ProcessorJob()
    processor_job.pipeline_applied = "test0"
    processor_job.save()

    dl_job = DownloaderJob()
    dl_job.downloader_task = "XYZ"
    dl_job.accession_code = "123"
    dl_job.save()

def test_download_file(self, mock_send_job):
    mock_send_job.return_value = None

    dlj = DownloaderJob()
    dlj.save()

    og = OriginalFile()
    og.source_filename = "Aegilops_tauschii.ASM34733v1.37.gtf.gz"
    og.source_url = self.gtf_download_url
    og.is_archive = True
    og.save()

    assoc = DownloaderJobOriginalFileAssociation()
    assoc.downloader_job = dlj
    assoc.original_file = og
    assoc.save()

    transcriptome_index.download_transcriptome(dlj.pk)

def test_download_file_unmated_reads(self):
    dlj = DownloaderJob()
    dlj.accession_code = "SRR1603661"
    dlj.save()

    og_1 = OriginalFile()
    og_1.source_filename = "SRR1603661_1.fastq.gz"
    og_1.source_url = "ftp.sra.ebi.ac.uk/vol1/fastq/SRR160/001/SRR1603661/SRR1603661_1.fastq.gz"
    og_1.expected_md5 = "502a9a482bfa5aa75865ccc0105ad13c"
    og_1.expected_size_in_bytes = 6751980628
    og_1.is_archive = True
    og_1.save()

    og_2 = OriginalFile()
    og_2.source_filename = "SRR1603661_2.fastq.gz"
    og_2.source_url = "ftp.sra.ebi.ac.uk/vol1/fastq/SRR160/001/SRR1603661/SRR1603661_2.fastq.gz"
    og_2.expected_md5 = "fffd24457418d255991f54ec82a39d57"
    og_2.expected_size_in_bytes = 6949912932
    og_2.is_archive = True
    og_2.save()

    sample = Sample()
    sample.accession_code = "SRR1603661"
    sample.save()

    assoc = OriginalFileSampleAssociation()
    assoc.sample = sample
    assoc.original_file = og_1
    assoc.save()

    assoc = DownloaderJobOriginalFileAssociation()
    assoc.downloader_job = dlj
    assoc.original_file = og_1
    assoc.save()

    assoc = OriginalFileSampleAssociation()
    assoc.sample = sample
    assoc.original_file = og_2
    assoc.save()

    assoc = DownloaderJobOriginalFileAssociation()
    assoc.downloader_job = dlj
    assoc.original_file = og_2
    assoc.save()

    result, downloaded_files = sra.download_sra(dlj.pk)
    utils.end_downloader_job(dlj, result)
    self.assertTrue(result)
    self.assertEqual(downloaded_files[0].sha1, "52bf22472069d04fa7767429f6ab78ebd10c0152")
    self.assertTrue(os.path.exists(downloaded_files[0].absolute_file_path))

def test_no_rnaseq(self):
    """Makes sure that no RNA-Seq data gets downloaded even if
    there's a job for it.
    """
    dlj = DownloaderJob()
    dlj.accession_code = 'GSE103217'
    dlj.save()

    original_file = OriginalFile()
    original_file.filename = "GSE103217_family.xml.tgz"
    original_file.source_url = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE103nnn/GSE103217/miniml/GSE103217_family.xml.tgz"
    original_file.source_filename = "GSE103217_family.xml.tgz"
    original_file.save()

    assoc = DownloaderJobOriginalFileAssociation()
    assoc.original_file = original_file
    assoc.downloader_job = dlj
    assoc.save()

    sample = Sample()
    sample.accession_code = 'GSE103217'
    sample.technology = "RNA-SEQ"
    sample.manufacturer = "ILLUMINA"
    sample.platform_accession_code = "Illumina HiSeq 2500"
    sample.save()

    og_assoc = OriginalFileSampleAssociation()
    og_assoc.sample = sample
    og_assoc.original_file = original_file
    og_assoc.save()

    download_result = geo.download_geo(dlj.id)
    self.assertFalse(download_result)

    dlj.refresh_from_db()
    self.assertFalse(dlj.success)
    # It's not necessarily that we didn't extract any files, but
    # none that were usable so it looks like none.
    self.assertEqual(dlj.failure_reason, "Failed to extract any downloaded files.")

def test_download_file_swapper(self):
    dlj = DownloaderJob()
    dlj.accession_code = "SRR9117853"
    dlj.save()

    og = OriginalFile()
    og.source_filename = "SRR9117853.sra"
    og.source_url = "[email protected]:/sra/sra-instant/reads/ByRun/sra/SRR/SRR9117/SRR9117853/SRR9117853.sra"
    og.is_archive = True
    og.save()

    sample = Sample()
    sample.accession_code = "SRR9117853"
    sample.save()

    assoc = OriginalFileSampleAssociation()
    assoc.sample = sample
    assoc.original_file = og
    assoc.save()

    assoc = DownloaderJobOriginalFileAssociation()
    assoc.downloader_job = dlj
    assoc.original_file = og
    assoc.save()

    result = sra._download_file(og.source_url, dlj, "/tmp/doomed", force_ftp=False)
    self.assertTrue(result)

def requeue_downloader_job(last_job: DownloaderJob) -> None:
    """Queues a new downloader job.

    The new downloader job will have num_retries one greater than
    last_job.num_retries.
    """
    num_retries = last_job.num_retries + 1

    new_job = DownloaderJob(num_retries=num_retries,
                            downloader_task=last_job.downloader_task,
                            accession_code=last_job.accession_code)
    new_job.save()

    for original_file in last_job.original_files.all():
        DownloaderJobOriginalFileAssociation.objects.get_or_create(
            downloader_job=new_job, original_file=original_file)

    logger.debug(
        "Requeuing Downloader Job which had ID %d with a new Downloader Job with ID %d.",
        last_job.id,
        new_job.id)
    try:
        if send_job(Downloaders[last_job.downloader_task], job=new_job, is_dispatch=True):
            last_job.retried = True
            last_job.success = False
            last_job.retried_job = new_job
            last_job.save()
        else:
            # Can't communicate with nomad just now, leave the job for a later loop.
            new_job.delete()
    except Exception:
        logger.error(
            "Failed to requeue Downloader Job which had ID %d with a new Downloader Job with ID %d.",
            last_job.id,
            new_job.id)
        # Can't communicate with nomad just now, leave the job for a later loop.
        new_job.delete()

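# A hedged sketch of a retry loop that could drive requeue_downloader_job()
# above, e.g. from a periodic foreman task. MAX_NUM_RETRIES and the query are
# assumptions, not taken from the surrounding code; the retried and num_retries
# fields are the ones the function itself maintains.
MAX_NUM_RETRIES = 2  # hypothetical cap on retries


def retry_failed_downloader_jobs() -> None:
    failed_jobs = DownloaderJob.objects.filter(
        success=False,
        retried=False,
        num_retries__lt=MAX_NUM_RETRIES,
    )
    for last_job in failed_jobs:
        # requeue_downloader_job marks the old job as retried on success and
        # deletes the new job if the queue can't be reached.
        requeue_downloader_job(last_job)
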
def test_download_multiple_zips(self, mock_send_job):
    """Tests that each sample gets one processor job no matter what.

    https://github.com/AlexsLemonade/refinebio/pull/351 deals with a bug
    where every file that was extracted to a directory got a processor
    job queued for it each time a downloader job ran which pointed to
    that directory. This test makes sure this bug stays squashed.

    It does so by running two downloader jobs for the same experiment
    which use two different zip files. Before this bug was squashed this
    would have resulted in the first sample getting a second processor
    job queued for it because the second downloader job would have found
    the file in the directory.
    """
    dlj1 = DownloaderJob()
    dlj1.accession_code = 'E-MEXP-433'
    dlj1.save()

    original_file = OriginalFile()
    original_file.source_url = "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MEXP/E-MEXP-433/E-MEXP-433.raw.1.zip"
    original_file.source_filename = "Waldhof_020604_R30_01-2753_U133A.CEL"
    original_file.save()

    assoc = DownloaderJobOriginalFileAssociation()
    assoc.original_file = original_file
    assoc.downloader_job = dlj1
    assoc.save()

    sample = Sample()
    sample.accession_code = 'E-MEXP-433-Waldhof_020604_R30_01-2753_U133A'
    sample.technology = "MICROARRAY"
    sample.manufacturer = "AFFYMETRIX"
    sample.has_raw = True
    # This is fake, but we don't currently support any agilent
    # platforms so we're using a platform that is supported.
    sample.platform_accession_code = "hgu133a"
    sample.save()

    OriginalFileSampleAssociation.objects.get_or_create(
        sample=sample, original_file=original_file)

    dlj2 = DownloaderJob()
    dlj2.accession_code = 'E-MEXP-433'
    dlj2.save()

    original_file = OriginalFile()
    original_file.source_url = "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MEXP/E-MEXP-433/E-MEXP-433.raw.2.zip"
    original_file.source_filename = "N08_U133A.CEL"
    original_file.save()

    assoc = DownloaderJobOriginalFileAssociation()
    assoc.original_file = original_file
    assoc.downloader_job = dlj2
    assoc.save()

    sample = Sample()
    sample.accession_code = 'E-MEXP-433-N08_U133A'
    sample.technology = "MICROARRAY"
    sample.manufacturer = "AFFYMETRIX"
    sample.has_raw = True
    # This is fake, but we don't currently support any agilent
    # platforms so we're using a platform that is supported.
    sample.platform_accession_code = "hgu133a"
    sample.save()

    OriginalFileSampleAssociation.objects.get_or_create(
        sample=sample, original_file=original_file)

    array_express.download_array_express(dlj1.id)
    array_express.download_array_express(dlj2.id)

    self.assertEqual(ProcessorJob.objects.all().count(), 2)

def test_dharma(self):
    dlj1 = DownloaderJob()
    dlj1.accession_code = 'D1'
    dlj1.worker_id = get_instance_id()
    dlj1.start_time = datetime.datetime.now()
    dlj1.save()

    dlj2 = DownloaderJob()
    dlj2.accession_code = 'D2'
    dlj2.worker_id = get_instance_id()
    dlj2.start_time = datetime.datetime.now()
    dlj2.save()

    dlj3 = DownloaderJob()
    dlj3.accession_code = 'D3'
    dlj3.worker_id = get_instance_id()
    dlj3.save()

    original_file = OriginalFile()
    original_file.source_url = "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MEXP/E-MEXP-433/E-MEXP-433.raw.1.zip"
    original_file.source_filename = "Waldhof_020604_R30_01-2753_U133A.CEL"
    original_file.save()

    assoc = DownloaderJobOriginalFileAssociation()
    assoc.original_file = original_file
    assoc.downloader_job = dlj3
    assoc.save()

    sample = Sample()
    sample.accession_code = 'Blahblahblah'
    sample.technology = "MICROARRAY"
    sample.manufacturer = "AFFYMETRIX"
    sample.has_raw = True
    sample.platform_accession_code = "hgu133a"
    sample.save()

    OriginalFileSampleAssociation.objects.get_or_create(
        sample=sample, original_file=original_file)

    exited = False
    try:
        utils.start_job(dlj3.id, max_downloader_jobs_per_node=2, force_harakiri=True)
    except SystemExit as e:
        # This is supposed to happen!
        self.assertTrue(True)
        exited = True
    except Exception as e:
        # This isn't!
        self.assertTrue(False)
    self.assertTrue(exited)

    exited = False
    try:
        utils.start_job(dlj3.id, max_downloader_jobs_per_node=15, force_harakiri=True)
    except SystemExit as e:
        # This is not supposed to happen!
        self.assertTrue(False)
        exited = True
    except Exception as e:
        # This is!
        self.assertTrue(True)
    self.assertFalse(exited)

def test_download_geo(self, mock_send_task):
    """Tests the main 'download_geo' function."""
    dlj = DownloaderJob()
    dlj.accession_code = 'GSE22427'
    dlj.save()

    original_file = OriginalFile()
    original_file.filename = "GSE22427_non-normalized.txt.gz"
    original_file.source_url = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE22nnn/GSE22427/suppl/GSE22427_non-normalized.txt.gz"
    original_file.source_filename = "GSE22427_non-normalized.txt.gz"
    original_file.save()

    assoc = DownloaderJobOriginalFileAssociation()
    assoc.original_file = original_file
    assoc.downloader_job = dlj
    assoc.save()

    sample = Sample()
    sample.accession_code = 'GSE22427'
    sample.technology = "MICROARRAY"
    sample.manufacturer = "AGILENT"
    sample.has_raw = True
    # This is fake, but we don't currently support any agilent
    # platforms so we're using a platform that is supported.
    sample.platform_accession_code = "Illumina_RatRef-12_V1.0"
    sample.save()

    sample_annotation = SampleAnnotation()
    sample_annotation.sample = sample
    sample_annotation.data = {
        'label_protocol_ch1': 'Agilent',
        'label_protocol_ch2': 'Agilent'
    }
    sample_annotation.save()

    og_assoc = OriginalFileSampleAssociation()
    og_assoc.sample = sample
    og_assoc.original_file = original_file
    og_assoc.save()

    download_result = geo.download_geo(dlj.id)

    file_assocs = OriginalFileSampleAssociation.objects.filter(sample=sample)
    self.assertEqual(file_assocs.count(), 2)

    for file_assoc in file_assocs:
        original_file = file_assoc.original_file
        if original_file.filename.endswith(".gz"):
            # We delete the archive after we extract from it
            self.assertFalse(original_file.is_downloaded)
        else:
            self.assertTrue(original_file.is_downloaded)

    # Make sure it worked
    self.assertTrue(download_result)
    self.assertTrue(dlj.failure_reason is None)
    self.assertTrue(len(ProcessorJob.objects.all()) > 0)
    self.assertEqual(ProcessorJob.objects.all()[0].pipeline_applied,
                     "AGILENT_TWOCOLOR_TO_PCL")
    self.assertEqual(ProcessorJob.objects.all()[0].ram_amount, 2048)

def test_download_and_extract_file(self):
    # Download function requires a DownloaderJob object,
    # can be blank for the simple case.
    dlj = DownloaderJob()
    dlj.save()

    # *_family.xml.tgz
    geo._download_file(
        'ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE10nnn/GSE10241/miniml/GSE10241_family.xml.tgz',
        '/home/user/data_store/GSE10241/raw/GSE10241_family.xml.tgz',
        dlj)
    files = geo._extract_tgz(
        '/home/user/data_store/GSE10241/raw/GSE10241_family.xml.tgz',
        'GSE10241')
    self.assertEqual(8, len(files))

    # GPL File
    self.assertTrue(os.path.isfile('/home/user/data_store/GSE10241/raw/GPL6102-tbl-1.txt'))

    # GSM Files
    self.assertTrue(os.path.isfile('/home/user/data_store/GSE10241/raw/GSM258515-tbl-1.txt'))
    self.assertTrue(os.path.isfile('/home/user/data_store/GSE10241/raw/GSM258516-tbl-1.txt'))
    self.assertTrue(os.path.isfile('/home/user/data_store/GSE10241/raw/GSM258530-tbl-1.txt'))
    self.assertTrue(os.path.isfile('/home/user/data_store/GSE10241/raw/GSM258517-tbl-1.txt'))

    # Original family file
    self.assertTrue(os.path.isfile('/home/user/data_store/GSE10241/raw/GSE10241_family.xml'))
    self.assertTrue(os.path.isfile('/home/user/data_store/GSE10241/raw/GSE10241_family.xml.tgz'))
    self.assertTrue(os.path.isfile('/home/user/data_store/GSE10241/raw/GSE10241_family.xml.tar'))

    # .txt.gz
    geo._download_file(
        'ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM254nnn/GSM254828/suppl/GSM254828.txt.gz',
        '/home/user/data_store/GSM254828/raw/GSM254828.txt.gz',
        dlj)
    files = geo._extract_gz(
        '/home/user/data_store/GSM254828/raw/GSM254828.txt.gz',
        'GSM254828')
    self.assertEqual(1, len(files))
    self.assertTrue(os.path.isfile('/home/user/data_store/GSM254828/raw/GSM254828.txt'))

    geo._download_file(
        "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE22nnn/GSE22427/suppl/GSE22427_non-normalized.txt.gz",
        '/home/user/data_store/GSE22427/raw/GSE22427_non-normalized.txt.gz',
        dlj)
    files = geo._extract_gz(
        '/home/user/data_store/GSE22427/raw/GSE22427_non-normalized.txt.gz',
        'GSE22427')
    self.assertEqual(1, len(files))
    self.assertTrue(os.path.isfile('/home/user/data_store/GSE22427/raw/GSE22427_non-normalized.txt'))

def test_organism_shepherd_command(self, mock_nomad, mock_send_job, mock_get_active_volumes):
    """Tests that the organism shepherd requeues jobs in the right order.

    The situation we're setting up is basically this:
      * There are two experiments.
      * One of them has 1/2 samples processed, the other 0/1.
      * One of them needs a DownloaderJob requeued and the other needs
        a ProcessorJob requeued.

    And what we're going to test for is:
      * Both of the jobs that need to be requeued are requeued.
      * The experiment with a processed sample is requeued first
        because it has a higher completion percentage.
    """
    # First, set up our mocks to prevent network calls.
    mock_send_job.return_value = True
    active_volumes = {"1", "2", "3"}
    mock_get_active_volumes.return_value = active_volumes

    def mock_init_nomad(host, port=0, timeout=0):
        ret_value = MagicMock()
        ret_value.jobs = MagicMock()
        ret_value.jobs.get_jobs = MagicMock()
        ret_value.jobs.get_jobs.side_effect = lambda: []
        return ret_value

    mock_nomad.side_effect = mock_init_nomad

    zebrafish = Organism(name="DANIO_RERIO", taxonomy_id=1337, is_scientific_name=True)
    zebrafish.save()

    # Experiment that is 0% complete.
    zero_percent_experiment = Experiment(accession_code='ERP037000')
    zero_percent_experiment.technology = 'RNA-SEQ'
    zero_percent_experiment.save()

    organism_assoc = ExperimentOrganismAssociation.objects.create(
        organism=zebrafish, experiment=zero_percent_experiment)

    zero_percent = OriginalFile()
    zero_percent.filename = "ERR037001.fastq.gz"
    zero_percent.source_filename = "ERR037001.fastq.gz"
    zero_percent.source_url = "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR037/ERR037001/ERR037001_1.fastq.gz"
    zero_percent.is_archive = True
    zero_percent.save()

    zero_percent_sample = Sample()
    zero_percent_sample.accession_code = 'ERR037001'
    zero_percent_sample.organism = zebrafish
    zero_percent_sample.save()

    assoc = OriginalFileSampleAssociation()
    assoc.sample = zero_percent_sample
    assoc.original_file = zero_percent
    assoc.save()

    assoc = ExperimentSampleAssociation()
    assoc.sample = zero_percent_sample
    assoc.experiment = zero_percent_experiment
    assoc.save()

    # TODO: fix names of all the variables to be appropriate for this test case.
    zero_percent_dl_job = DownloaderJob()
    zero_percent_dl_job.accession_code = zero_percent_sample.accession_code
    zero_percent_dl_job.downloader_task = "SRA"
    zero_percent_dl_job.start_time = timezone.now()
    zero_percent_dl_job.end_time = timezone.now()
    zero_percent_dl_job.success = False
    zero_percent_dl_job.save()

    assoc = DownloaderJobOriginalFileAssociation()
    assoc.downloader_job = zero_percent_dl_job
    assoc.original_file = zero_percent
    assoc.save()

    # Experiment that is 50% complete.
    fifty_percent_experiment = Experiment(accession_code='ERP036000')
    fifty_percent_experiment.technology = 'RNA-SEQ'
    fifty_percent_experiment.save()

    organism_assoc = ExperimentOrganismAssociation.objects.create(
        organism=zebrafish, experiment=fifty_percent_experiment)

    ## First sample, this one has been processed.
    successful_pj = ProcessorJob()
    successful_pj.accession_code = "ERR036000"
    successful_pj.pipeline_applied = "SALMON"
    successful_pj.ram_amount = 12288
    successful_pj.start_time = timezone.now()
    successful_pj.end_time = timezone.now()
    successful_pj.success = True
    successful_pj.save()

    successful_og = OriginalFile()
    successful_og.filename = "ERR036000.fastq.gz"
    successful_og.source_filename = "ERR036000.fastq.gz"
    successful_og.source_url = "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036000/ERR036000_1.fastq.gz"
    successful_og.is_archive = True
    successful_og.save()

    successful_sample = Sample()
    successful_sample.accession_code = 'ERR036000'
    successful_sample.organism = zebrafish
    successful_sample.save()

    assoc = OriginalFileSampleAssociation()
    assoc.sample = successful_sample
    assoc.original_file = successful_og
    assoc.save()

    assoc = ProcessorJobOriginalFileAssociation()
    assoc.processor_job = successful_pj
    assoc.original_file = successful_og
    assoc.save()

    assoc = ExperimentSampleAssociation()
    assoc.sample = successful_sample
    assoc.experiment = fifty_percent_experiment
    assoc.save()

    ## Second sample, this one hasn't been processed.
    fifty_percent_unprocessed_og = OriginalFile()
    fifty_percent_unprocessed_og.filename = "ERR036001.fastq.gz"
    fifty_percent_unprocessed_og.source_filename = "ERR036001.fastq.gz"
    fifty_percent_unprocessed_og.source_url = "ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR036/ERR036001/ERR036001_1.fastq.gz"
    fifty_percent_unprocessed_og.is_archive = True
    fifty_percent_unprocessed_og.save()

    fifty_percent_unprocessed_sample = Sample()
    fifty_percent_unprocessed_sample.accession_code = 'ERR036001'
    fifty_percent_unprocessed_sample.organism = zebrafish
    fifty_percent_unprocessed_sample.save()

    assoc = OriginalFileSampleAssociation()
    assoc.sample = fifty_percent_unprocessed_sample
    assoc.original_file = fifty_percent_unprocessed_og
    assoc.save()

    assoc = ExperimentSampleAssociation()
    assoc.sample = fifty_percent_unprocessed_sample
    assoc.experiment = fifty_percent_experiment
    assoc.save()

    fifty_percent_processor_job = ProcessorJob()
    fifty_percent_processor_job.pipeline_applied = "SALMON"
    fifty_percent_processor_job.accession_code = fifty_percent_unprocessed_sample.accession_code
    fifty_percent_processor_job.ram_amount = 12288
    fifty_percent_processor_job.start_time = timezone.now()
    fifty_percent_processor_job.end_time = timezone.now()
    fifty_percent_processor_job.success = False
    fifty_percent_processor_job.save()

    assoc = ProcessorJobOriginalFileAssociation()
    assoc.processor_job = fifty_percent_processor_job
    assoc.original_file = fifty_percent_unprocessed_og
    assoc.save()

    # Setup is done, actually run the command.
    args = []
    options = {"organism_name": "DANIO_RERIO"}
    call_command("organism_shepherd", *args, **options)

    # Verify that the jobs were called in the correct order.
    mock_calls = mock_send_job.mock_calls

    first_call_job_type = mock_calls[0][1][0]
    first_call_job_object = mock_calls[0][2]["job"]
    self.assertEqual(first_call_job_type, ProcessorPipeline.SALMON)
    self.assertEqual(first_call_job_object.pipeline_applied,
                     fifty_percent_processor_job.pipeline_applied)
    self.assertEqual(first_call_job_object.ram_amount,
                     fifty_percent_processor_job.ram_amount)
    self.assertIn(first_call_job_object.volume_index, active_volumes)

    fifty_percent_processor_job.refresh_from_db()
    self.assertEqual(first_call_job_object, fifty_percent_processor_job.retried_job)

    second_call_job_type = mock_calls[1][1][0]
    second_call_job_object = mock_calls[1][2]["job"]
    self.assertEqual(second_call_job_type, Downloaders.SRA)
    self.assertEqual(second_call_job_object.accession_code,
                     zero_percent_dl_job.accession_code)
    self.assertEqual(second_call_job_object.downloader_task,
                     zero_percent_dl_job.downloader_task)

    zero_percent_dl_job.refresh_from_db()
    self.assertEqual(second_call_job_object, zero_percent_dl_job.retried_job)

def test_download_aspera_and_ftp(self):
    """Tests the GEO downloader's Aspera and FTP download paths."""
    dlj = DownloaderJob()
    dlj.accession_code = 'GSE22427'
    dlj.save()

    original_file = OriginalFile()
    original_file.source_url = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE22nnn/GSE22427/suppl/GSE22427_non-normalized.txt.gz"
    original_file.source_filename = "GSE22427_non-normalized.txt.gz"
    original_file.save()

    assoc = DownloaderJobOriginalFileAssociation()
    assoc.original_file = original_file
    assoc.downloader_job = dlj
    assoc.save()

    sample = Sample()
    sample.accession_code = 'GSE22427'
    sample.save()

    sample_annotation = SampleAnnotation()
    sample_annotation.sample = sample
    sample_annotation.data = {
        'label_protocol_ch1': 'Agilent',
        'label_protocol_ch2': 'Agilent'
    }
    sample_annotation.save()

    og_assoc = OriginalFileSampleAssociation()
    og_assoc.sample = sample
    og_assoc.original_file = original_file
    og_assoc.save()

    LOCAL_ROOT_DIR = "/home/user/data_store"
    os.makedirs(LOCAL_ROOT_DIR + '/' + sample.accession_code, exist_ok=True)
    dl_file_path = (LOCAL_ROOT_DIR + '/' + sample.accession_code + '/' +
                    original_file.source_url.split('/')[-1])

    # Aspera
    result = geo._download_file(original_file.source_url,
                                file_path=dl_file_path,
                                job=dlj,
                                force_ftp=False)
    self.assertTrue(result)
    self.assertTrue(os.path.exists(dl_file_path))
    os.remove(dl_file_path)

    # FTP
    result = geo._download_file(original_file.source_url,
                                file_path=dl_file_path,
                                job=dlj,
                                force_ftp=True)
    self.assertTrue(result)
    self.assertTrue(os.path.exists(dl_file_path))
    os.remove(dl_file_path)

    # Aspera, fail
    result = geo._download_file_aspera("https://rich.zone/cool_horse.jpg",
                                       target_file_path=dl_file_path,
                                       downloader_job=dlj,
                                       attempt=5)
    self.assertFalse(result)
    self.assertIsNotNone(dlj.failure_reason)

def test_no_repeat_jobs(self):
    """Make sure that queue_downloader_job_for_original_files doesn't
    queue a new DownloaderJob when one already exists for the files.
    """
    # First, create an experiment with two samples associated with it
    # and create two original files for each of those samples.
    experiment_object = Experiment()
    experiment_object.accession_code = "Experiment1"
    experiment_object.save()

    sample_object = Sample()
    sample_object.accession_code = "Sample1"
    sample_object.platform_accession_code = "Illumina Genome Analyzer"
    sample_object.platform_accession_name = "Illumina Genome Analyzer"
    sample_object.technology = "RNA-SEQ"
    sample_object.manufacturer = "ILLUMINA"
    sample_object.source_database = "SRA"
    sample_object.save()

    original_file_1 = OriginalFile()
    original_file_1.source_url = "first_url"
    original_file_1.source_filename = "first_filename"
    original_file_1.is_downloaded = False
    original_file_1.has_raw = True
    original_file_1.save()

    original_file_sample_association = OriginalFileSampleAssociation()
    original_file_sample_association.original_file = original_file_1
    original_file_sample_association.sample = sample_object
    original_file_sample_association.save()

    original_file_2 = OriginalFile()
    original_file_2.source_url = "second_url"
    original_file_2.source_filename = "second_filename"
    original_file_2.is_downloaded = False
    original_file_2.has_raw = True
    original_file_2.save()

    original_file_sample_association = OriginalFileSampleAssociation()
    original_file_sample_association.original_file = original_file_2
    original_file_sample_association.sample = sample_object
    original_file_sample_association.save()

    dlj = DownloaderJob()
    dlj.save()

    DownloaderJobOriginalFileAssociation(
        downloader_job=dlj, original_file=original_file_1).save()
    DownloaderJobOriginalFileAssociation(
        downloader_job=dlj, original_file=original_file_2).save()

    survey_job = SurveyJob(source_type="SRA")
    survey_job.save()
    surveyor = SraSurveyor(survey_job)

    surveyor.queue_downloader_job_for_original_files(
        [original_file_1, original_file_2], experiment_object.accession_code)

    # We made one DownloaderJob in this test, so
    # queue_downloader_job_for_original_files didn't have anything
    # to do, so there should still be only one:
    self.assertEqual(1, DownloaderJob.objects.all().count())

def queue_downloader_jobs(self, experiment: Experiment, samples: List[Sample]):
    """This enqueues DownloaderJobs on a per-file basis.

    There is a complementary function below for enqueueing multi-file
    DownloaderJobs.
    """
    files_to_download = []
    for sample in samples:
        files_for_sample = OriginalFile.objects.filter(sample=sample, is_downloaded=False)
        for og_file in files_for_sample:
            files_to_download.append(og_file)

    download_urls_with_jobs = {}
    for original_file in files_to_download:
        # We don't need to create multiple downloaders for the same file.
        # However, we do want to associate original_files with the
        # DownloaderJobs that will download them.
        if original_file.source_url in download_urls_with_jobs.keys():
            DownloaderJobOriginalFileAssociation.objects.get_or_create(
                downloader_job=download_urls_with_jobs[original_file.source_url],
                original_file=original_file,
            )
            continue

        # There is already a downloader job associated with this file.
        old_assocs_count = DownloaderJobOriginalFileAssociation.objects.filter(
            original_file__source_url=original_file.source_url).count()
        if old_assocs_count > 0:
            logger.debug(
                "We found an existing DownloaderJob for this file/url.",
                original_file_id=original_file.id,
            )
            continue

        sample_object = original_file.samples.first()
        downloader_task = determine_downloader_task(sample_object)

        if downloader_task == Downloaders.NONE:
            logger.info(
                "No valid downloader task found for sample.",
                sample=sample_object.id,
                original_file=original_file.id,
            )
        else:
            downloader_job = DownloaderJob()
            downloader_job.downloader_task = downloader_task.value
            downloader_job.accession_code = experiment.accession_code
            downloader_job.save()

            DownloaderJobOriginalFileAssociation.objects.get_or_create(
                downloader_job=downloader_job, original_file=original_file)
            download_urls_with_jobs[original_file.source_url] = downloader_job

            try:
                logger.info(
                    "Queuing downloader job for URL: " + original_file.source_url,
                    survey_job=self.survey_job.id,
                    downloader_job=downloader_job.id,
                )
                send_job(downloader_task, downloader_job)
            except Exception:
                # If we fail to queue the job, it will be requeued.
                pass

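# A hedged usage sketch for queue_downloader_jobs() above: once a surveyor has
# created an Experiment and its Samples, one call enqueues per-file
# DownloaderJobs for all of them. The helper name is illustrative, and the
# experiment.samples relation is an assumption based on the
# ExperimentSampleAssociation model used throughout this file.
def enqueue_downloads_for_survey(surveyor, experiment: Experiment) -> None:
    # Assumes Experiment exposes its samples through the association model.
    samples = list(experiment.samples.all())
    surveyor.queue_downloader_jobs(experiment, samples)
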
def test_download_and_extract_file(self):
    # Download function requires a DownloaderJob object,
    # can be blank for the simple case.
    dlj = DownloaderJob()
    dlj.save()

    # *_family.xml.tgz
    geo._download_file(
        "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE10nnn/GSE10241/miniml/GSE10241_family.xml.tgz",
        "/home/user/data_store/GSE10241/raw/GSE10241_family.xml.tgz",
        dlj,
    )
    archive_file = geo.ArchivedFile("/home/user/data_store/GSE10241/raw/GSE10241_family.xml.tgz")
    files = [file for file in archive_file.get_files()]

    # There should be 8 files in total in the directory, 2 downloaded and 6 extracted.
    # `archive_file.get_files()` only returns the files that are extracted from the
    # archives instead of enumerating over all files.
    self.assertEqual(6, len(files))

    # GPL File
    self.assertTrue(os.path.isfile("/home/user/data_store/GSE10241/raw/GPL6102-tbl-1.txt"))

    # GSM Files
    self.assertTrue(os.path.isfile("/home/user/data_store/GSE10241/raw/GSM258515-tbl-1.txt"))
    self.assertTrue(os.path.isfile("/home/user/data_store/GSE10241/raw/GSM258516-tbl-1.txt"))
    self.assertTrue(os.path.isfile("/home/user/data_store/GSE10241/raw/GSM258530-tbl-1.txt"))
    self.assertTrue(os.path.isfile("/home/user/data_store/GSE10241/raw/GSM258517-tbl-1.txt"))

    # Original family file
    self.assertTrue(os.path.isfile("/home/user/data_store/GSE10241/raw/GSE10241_family.xml"))
    self.assertTrue(os.path.isfile("/home/user/data_store/GSE10241/raw/GSE10241_family.xml.tgz"))
    self.assertTrue(os.path.isfile("/home/user/data_store/GSE10241/raw/GSE10241_family.xml.tar"))

    # .txt.gz
    geo._download_file(
        "ftp://ftp.ncbi.nlm.nih.gov/geo/samples/GSM254nnn/GSM254828/suppl/GSM254828.txt.gz",
        "/home/user/data_store/GSM254828/raw/GSM254828.txt.gz",
        dlj,
    )
    archive_file = geo.ArchivedFile("/home/user/data_store/GSM254828/raw/GSM254828.txt.gz")
    files = [file for file in archive_file.get_files()]
    self.assertEqual(1, len(files))
    self.assertTrue(os.path.isfile("/home/user/data_store/GSM254828/raw/GSM254828.txt"))

    geo._download_file(
        "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE22nnn/GSE22427/suppl/GSE22427_non-normalized.txt.gz",
        "/home/user/data_store/GSE22427/raw/GSE22427_non-normalized.txt.gz",
        dlj,
    )
    archive_file = geo.ArchivedFile(
        "/home/user/data_store/GSE22427/raw/GSE22427_non-normalized.txt.gz")
    files = [file for file in archive_file.get_files()]
    self.assertEqual(1, len(files))
    self.assertTrue(os.path.isfile("/home/user/data_store/GSE22427/raw/GSE22427_non-normalized.txt"))

def setUp(self):
    # Saving this for if we have protected endpoints
    # self.superuser = User.objects.create_superuser('john', '*****@*****.**', 'johnpassword')
    # self.client.login(username='******', password='******')
    # self.user = User.objects.create(username="******")

    experiment = Experiment()
    experiment.accession_code = "GSE000"
    experiment.alternate_accession_code = "E-GEOD-000"
    experiment.title = "NONONONO"
    experiment.description = "Boooooourns. Wasabi."
    experiment.technology = "RNA-SEQ"
    experiment.save()

    experiment = Experiment()
    experiment.accession_code = "GSE123"
    experiment.title = "Hey Ho Let's Go"
    experiment.description = (
        "This is a very exciting test experiment. Faygo soda. Blah blah blah."
    )
    experiment.technology = "MICROARRAY"
    experiment.save()
    self.experiment = experiment

    experiment_annotation = ExperimentAnnotation()
    experiment_annotation.data = {"hello": "world", "123": 456}
    experiment_annotation.experiment = experiment
    experiment_annotation.save()

    # Create 26 test organisms numbered 0-25 for pagination test, so there
    # should be 29 organisms total (with the 3 others below).
    for i in range(26):
        Organism(name=("TEST_ORGANISM_{}".format(i)), taxonomy_id=(1234 + i)).save()

    ailuropoda = Organism(name="AILUROPODA_MELANOLEUCA", taxonomy_id=9646, is_scientific_name=True)
    ailuropoda.save()

    self.homo_sapiens = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
    self.homo_sapiens.save()

    self.danio_rerio = Organism(name="DANIO_RERIO", taxonomy_id=1337, is_scientific_name=True)
    self.danio_rerio.save()

    sample = Sample()
    sample.title = "123"
    sample.accession_code = "123"
    sample.is_processed = True
    sample.organism = ailuropoda
    sample.save()

    sample = Sample()
    sample.title = "789"
    sample.accession_code = "789"
    sample.is_processed = True
    sample.organism = ailuropoda
    sample.save()
    self.sample = sample

    # add qn target for sample organism
    result = ComputationalResult()
    result.commands.append("create_qn_target.py")
    result.is_ccdl = True
    result.is_public = True
    result.processor = None
    result.save()

    cra = ComputationalResultAnnotation()
    cra.result = result
    cra.data = {"organism_id": ailuropoda.id, "is_qn": True}
    cra.save()

    ailuropoda.qn_target = result
    ailuropoda.save()

    sample_annotation = SampleAnnotation()
    sample_annotation.data = {"goodbye": "world", "789": 123}
    sample_annotation.sample = sample
    sample_annotation.save()

    original_file = OriginalFile()
    original_file.save()

    original_file_sample_association = OriginalFileSampleAssociation()
    original_file_sample_association.sample = sample
    original_file_sample_association.original_file = original_file
    original_file_sample_association.save()

    downloader_job = DownloaderJob()
    downloader_job.save()

    download_assoc = DownloaderJobOriginalFileAssociation()
    download_assoc.original_file = original_file
    download_assoc.downloader_job = downloader_job
    download_assoc.save()

    processor_job = ProcessorJob()
    processor_job.save()

    processor_assoc = ProcessorJobOriginalFileAssociation()
    processor_assoc.original_file = original_file
    processor_assoc.processor_job = processor_job
    processor_assoc.save()

    experiment_sample_association = ExperimentSampleAssociation()
    experiment_sample_association.sample = sample
    experiment_sample_association.experiment = experiment
    experiment_sample_association.save()
    experiment.num_total_samples = 1
    experiment.num_processed_samples = 1
    experiment.save()

    result = ComputationalResult()
    result.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    result = ComputationalResult()
    result.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    processor = Processor()
    processor.name = "Salmon Quant"
    processor.version = "v9.9.9"
    processor.docker_image = "dr_salmon"
    processor.environment = '{"some": "environment"}'
    processor.save()

    computational_result_short = ComputationalResult(processor=processor)
    computational_result_short.save()

    organism_index = OrganismIndex()
    organism_index.index_type = "TRANSCRIPTOME_SHORT"
    organism_index.organism = self.danio_rerio
    organism_index.result = computational_result_short
    organism_index.absolute_directory_path = (
        "/home/user/data_store/salmon_tests/TRANSCRIPTOME_INDEX/SHORT")
    organism_index.is_public = True
    organism_index.s3_url = "not_blank"
    organism_index.save()

    return

def test_create_missing_jobs(self):
    """Tests that files which should have downloader jobs get them created."""
    # 1. create a sample with an original file and a downloader job
    original_file_with_downloader = OriginalFile()
    original_file_with_downloader.filename = "processed.CEL"
    original_file_with_downloader.source_filename = "processed.CEL"
    original_file_with_downloader.is_downloaded = True
    original_file_with_downloader.is_archive = False
    original_file_with_downloader.save()

    sample_with_downloader = Sample()
    sample_with_downloader.accession_code = "MA_doesnt_need_processor"
    sample_with_downloader.technology = "MICROARRAY"
    sample_with_downloader.source_database = "GEO"
    sample_with_downloader.platform_accession_code = "bovine"
    sample_with_downloader.save()

    OriginalFileSampleAssociation.objects.get_or_create(
        sample=sample_with_downloader,
        original_file=original_file_with_downloader)

    downloader_job = DownloaderJob()
    downloader_job.success = True
    downloader_job.worker_id = "worker_1"
    downloader_job.volume_index = "1"
    downloader_job.save()

    DownloaderJobOriginalFileAssociation.objects.get_or_create(
        downloader_job=downloader_job,
        original_file=original_file_with_downloader)

    # 2. create a sample with an original file and no downloader job
    original_file = OriginalFile()
    original_file.filename = "tarball.gz"
    original_file.source_filename = "tarball.gz"
    original_file.is_downloaded = True
    original_file.is_archive = True
    original_file.save()

    sample_no_downloader = Sample()
    sample_no_downloader.accession_code = "sample_no_downloader"
    sample_no_downloader.technology = "MICROARRAY"
    sample_no_downloader.source_database = "GEO"
    sample_no_downloader.platform_accession_code = "bovine"  # must be a supported platform
    sample_no_downloader.save()

    OriginalFileSampleAssociation.objects.get_or_create(
        sample=sample_no_downloader, original_file=original_file)

    # 3. Setup is done, actually run the command.
    command = Command()
    command.handle()

    ## Test that a missing downloader job was created.
    self.assertEqual(
        1,
        DownloaderJobOriginalFileAssociation.objects.filter(
            original_file=original_file).count(),
    )

    ## Test that a downloader job that wasn't missing wasn't created.
    ## Of course, we created one in test setup, so we're really
    ## checking that it's still only 1.
    self.assertEqual(
        1,
        DownloaderJobOriginalFileAssociation.objects.filter(
            original_file=original_file_with_downloader).count(),
    )