def test_verification_failure(self, _extract_file, _download_file, mock_send_job):
    """A mismatched download URL should fail batch-grouping verification."""
    mock_send_job.return_value = None

    # Give one file a download URL that differs from its siblings so the
    # _verify_batch_grouping step rejects the group before any work happens.
    batches, files = self.insert_objects()
    bad_file = files[1]
    bad_file.download_url = "https://wompwomp.com"
    bad_file.save()

    job = DownloaderJob.create_job_and_relationships(batches=batches)

    # Run the downloader; it should bail out before downloading anything.
    array_express.download_array_express(job.id)

    for mocked in (_download_file, _extract_file, mock_send_job):
        mocked.assert_not_called()

    # The persisted job record should reflect the failure.
    job = DownloaderJob.objects.get()
    self.assertFalse(job.success)
    self.assertIsNotNone(job.start_time)
    self.assertIsNotNone(job.end_time)
    self.assertEqual(
        job.failure_reason,
        ("A Batch's file doesn't have the same download "
         "URL as the other batches' files."))
def handle(self, *args, **options):
    """Dispatch a downloader job by name to the function that runs it.

    Exits with status 1 on a missing job ID, an unknown job name, or a
    job type with no registered downloader; exits 0 on success.
    """
    if options["job_id"] is None:
        logger.error("You must specify a job ID.")
        sys.exit(1)

    try:
        job_type = Downloaders[options["job_name"]]
    except KeyError:
        logger.error("You must specify a valid job name.")
        sys.exit(1)

    # Map each known job type to its downloader entry point.
    dispatch = {
        Downloaders.ARRAY_EXPRESS: download_array_express,
        Downloaders.TRANSCRIPTOME_INDEX: download_transcriptome,
        Downloaders.SRA: download_sra,
        Downloaders.GEO: download_geo,
    }

    downloader = dispatch.get(job_type)
    if downloader is None:
        logger.error(
            ("A valid job name was specified for job %s with id %d but "
             "no downloader function is known to run it."),
            options["job_name"],
            options["job_id"],
        )
        sys.exit(1)

    downloader(options["job_id"])
    sys.exit(0)
def test_download(self, _extract_file, _download_file, _verify_batch_grouping, mock_send_job):
    """Happy path: download runs end to end and queues processor jobs.

    Checks that the grouping/download/extract helpers are called with the
    expected arguments, that Batch rows are marked DOWNLOADED, that the
    DownloaderJob is marked successful, and that one AFFY_TO_PCL processor
    job is queued per sample (two for this fixture).
    """
    mock_send_job.return_value = None

    batches, files = self.insert_objects()
    downloader_job = DownloaderJob.create_job_and_relationships(
        batches=batches)

    # Call the function we're testing:
    array_express.download_array_express(downloader_job.id)

    # Temp path the downloader is expected to write the zip to; keyed on
    # the job id so concurrent jobs don't collide.
    target_file_path = (
        "/home/user/data_store/temp/A-AFFY-1/AFFY_TO_PCL/downloader_job_{}"
        "/E-GEOD-59071.raw.3.zip").format(str(downloader_job.id))

    # Verify that all expected functionality is run:
    self.assertEqual(_verify_batch_grouping.call_count, 1)
    download_url = "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/GEOD/E-GEOD-59071/E-GEOD-59071.raw.3.zip"  # noqa
    _download_file.assert_called_with(download_url,
                                      target_file_path,
                                      downloader_job)

    # _extract_file is called positionally with (file_query_set, job);
    # compare the queryset contents and the job id, not object identity.
    args, _ = _extract_file.call_args
    file_query_set, job = args
    self.assertEqual(list(file_query_set), files)
    self.assertEqual(job.id, downloader_job.id)

    # Verify that the database has been updated correctly:
    batches = Batch.objects.all()
    for batch in batches:
        self.assertEqual(batch.status, BatchStatuses.DOWNLOADED.value)

    downloader_job = DownloaderJob.objects.get()
    self.assertTrue(downloader_job.success)
    self.assertIsNotNone(downloader_job.start_time)
    self.assertIsNotNone(downloader_job.end_time)

    # Two processor jobs expected for this fixture, both AFFY_TO_PCL.
    processor_jobs = ProcessorJob.objects.all()
    self.assertEqual(len(processor_jobs), 2)
    mock_send_job.assert_has_calls([
        call(ProcessorPipeline.AFFY_TO_PCL, processor_jobs[0].id),
        call(ProcessorPipeline.AFFY_TO_PCL, processor_jobs[1].id)
    ])
def test_extraction_failure(self, _download_file, mock_send_job):
    """An extraction error should mark the job failed with a clear reason."""
    mock_send_job.return_value = None

    batches, files = self.insert_objects()
    job = DownloaderJob.create_job_and_relationships(batches=batches)

    # Run the downloader; extraction is expected to raise here
    # (failure setup presumably lives in the patch decorators above).
    array_express.download_array_express(job.id)

    mock_send_job.assert_not_called()

    # The persisted job record should reflect the failure.
    job = DownloaderJob.objects.get()
    self.assertFalse(job.success)
    self.assertIsNotNone(job.start_time)
    self.assertIsNotNone(job.end_time)

    # The failure reason names the zip that couldn't be extracted.
    temp_dir = utils.JOB_DIR_PREFIX + str(job.id)
    expected_zip = files[0].get_temp_download_path(temp_dir)
    self.assertEqual(job.failure_reason,
                     "Exception caught while extracting " + expected_zip)
def test_download_failure(self, _extract_file, _open, mock_send_job):
    """If opening the download target raises, the job is marked failed."""
    # Set up mocks: make the mocked open() blow up immediately.
    mock_send_job.return_value = None
    _open.side_effect = Exception()

    batches, _ = self.insert_objects()
    job = DownloaderJob.create_job_and_relationships(batches=batches)

    # Run the downloader; the download step fails, so nothing downstream
    # should be invoked.
    array_express.download_array_express(job.id)

    _extract_file.assert_not_called()
    mock_send_job.assert_not_called()

    # The persisted job record should reflect the failure.
    job = DownloaderJob.objects.get()
    self.assertFalse(job.success)
    self.assertIsNotNone(job.start_time)
    self.assertIsNotNone(job.end_time)
    self.assertEqual(job.failure_reason,
                     "Exception caught while downloading batch")
def test_download_multiple_zips(self, mock_send_job):
    """Tests that each sample gets one processor job no matter what.

    https://github.com/AlexsLemonade/refinebio/pull/351 deals with a bug
    where every file that was extracted to a directory got a processor
    job queued for it each time a downloader job ran which pointed to
    that directory. This test makes sure this bug stays squashed.

    It does so by running two downloader jobs for the same experiment
    which use two different zip files. Before this bug was squashed this
    would have resulted in the first sample getting a second processor
    job queued for it because the second downloader job would have found
    the file in the directory.
    """
    # --- First downloader job: zip #1, sample Waldhof_020604... ---
    dlj1 = DownloaderJob()
    dlj1.accession_code = 'E-MEXP-433'
    dlj1.save()

    original_file = OriginalFile()
    original_file.source_url = "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MEXP/E-MEXP-433/E-MEXP-433.raw.1.zip"
    original_file.source_filename = "Waldhof_020604_R30_01-2753_U133A.CEL"
    original_file.save()

    assoc = DownloaderJobOriginalFileAssociation()
    assoc.original_file = original_file
    assoc.downloader_job = dlj1
    assoc.save()

    sample = Sample()
    sample.accession_code = 'E-MEXP-433-Waldhof_020604_R30_01-2753_U133A'
    sample.technology = "MICROARRAY"
    sample.manufacturer = "AFFYMETRIX"
    sample.has_raw = True
    # This is fake, but we don't currently support any agilent
    # platforms so we're using a platform that is supported.
    sample.platform_accession_code = "hgu133a"
    sample.save()

    OriginalFileSampleAssociation.objects.get_or_create(
        sample=sample, original_file=original_file)

    # --- Second downloader job: same experiment, zip #2, sample N08 ---
    dlj2 = DownloaderJob()
    dlj2.accession_code = 'E-MEXP-433'
    dlj2.save()

    original_file = OriginalFile()
    original_file.source_url = "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MEXP/E-MEXP-433/E-MEXP-433.raw.2.zip"
    original_file.source_filename = "N08_U133A.CEL"
    original_file.save()

    assoc = DownloaderJobOriginalFileAssociation()
    assoc.original_file = original_file
    assoc.downloader_job = dlj2
    assoc.save()

    sample = Sample()
    sample.accession_code = 'E-MEXP-433-N08_U133A'
    sample.technology = "MICROARRAY"
    sample.manufacturer = "AFFYMETRIX"
    sample.has_raw = True
    # This is fake, but we don't currently support any agilent
    # platforms so we're using a platform that is supported.
    sample.platform_accession_code = "hgu133a"
    sample.save()

    OriginalFileSampleAssociation.objects.get_or_create(
        sample=sample, original_file=original_file)

    # Run both jobs; if the PR 351 bug regressed, the second run would
    # queue an extra processor job for the first sample's file.
    array_express.download_array_express(dlj1.id)
    array_express.download_array_express(dlj2.id)

    self.assertEqual(ProcessorJob.objects.all().count(), 2)