def test_survey(self, mock_send_task):
    """A simple test of the ArrayExpress surveyor."""
    survey_job = self.create_job_for_accession("E-MTAB-3050")
    ae_surveyor = ArrayExpressSurveyor(survey_job)
    ae_surveyor.survey()

    samples = Sample.objects.all()
    downloader_jobs = DownloaderJob.objects.all()

    # We are expecting this to discover 5 samples.
    self.assertEqual(samples.count(), 5)
    # And for one DownloaderJob to be created for all of them.
    self.assertEqual(downloader_jobs.count(), 1)

    sample = Sample.objects.first()
    self.assertTrue(' (hgu95av2)' in sample.pretty_platform)

    # Confirm the sample's protocol_info
    self.assertEqual(len(sample.protocol_info), 9)
    self.assertEqual(sample.protocol_info[0]['Accession'], "P-MTAB-41854")
    self.assertEqual(sample.protocol_info[0]['Text'], "Aliquoting of biomaterials.")
    self.assertEqual(sample.protocol_info[0]['Type'], "split")

    survey_job2 = self.create_job_for_accession("E-GEOD-44719")
    ae_surveyor = ArrayExpressSurveyor(survey_job2)
    ae_surveyor.survey()

    # We are expecting this to discover 77 more samples.
    self.assertEqual(samples.count(), 77 + 5)
    # And for one more DownloaderJob to be created for all of them.
    self.assertEqual(downloader_jobs.count(), 2)
def test_survey(self):
    survey_job = SurveyJob(source_type="ARRAY_EXPRESS")
    surveyor = ArrayExpressSurveyor(survey_job)

    file1 = File(download_url="a")
    file2 = File(download_url="a")
    file3 = File(download_url="b")
    file4 = File(download_url="a")

    batch1 = Batch(files=[file1])
    batch2 = Batch(files=[file2])
    batch3 = Batch(files=[file3, file4])
    surveyor.batches = [batch1, batch2, batch3]

    groups = surveyor.group_batches()
    self.assertEqual(groups, [[batch1, batch2], [batch3]])
def test_experiment_object(self, mock_get):
    """The get_experiment_metadata function extracts all experiment
    metadata from the experiments API.
    """
    mock_get.return_value = Mock(ok=True)
    mock_get.return_value.json.return_value = json.loads(EXPERIMENTS_JSON)

    ae_surveyor = ArrayExpressSurveyor(self.survey_job)
    experiment = ae_surveyor.get_experiment_metadata("E-MTAB-3050")

    self.assertEqual("Microarray analysis of in vitro differentiation",
                     experiment["name"])
    self.assertEqual("E-MTAB-3050", experiment["experiment_accession_code"])
    self.assertEqual("A-AFFY-1", experiment["platform_accession_code"])
    self.assertEqual("2014-10-31", experiment["release_date"])
    self.assertEqual("2014-10-30", experiment["last_update_date"])
def test_survey_with_protocol_list(self):
    """Tests an edge case that came up after months:
    https://github.com/AlexsLemonade/refinebio/issues/761
    """
    survey_job = self.create_job_for_accession("E-MEXP-2381")
    ae_surveyor = ArrayExpressSurveyor(survey_job)
    ae_surveyor.survey()

    samples = Sample.objects.all()
    downloader_jobs = DownloaderJob.objects.all()

    # We are expecting this to discover 2 samples.
    self.assertEqual(samples.count(), 2)
    # And for one DownloaderJob to be created for all of them.
    self.assertEqual(downloader_jobs.count(), 1)
def test_survey(self, mock_send_task, mock_get):
    """The 'survey' function generates one Batch per sample.

    This test also tests the handle_batches method of ExternalSourceSurveyor
    which isn't tested on its own because it is an abstract class.
    """
    mock_send_task.return_value = Mock(ok=True)
    mock_get.side_effect = mocked_requests_get

    ae_surveyor = ArrayExpressSurveyor(self.survey_job)
    ae_surveyor.survey()

    downloader_jobs = DownloaderJob.objects.all()
    mock_send_task.assert_has_calls(
        [call(Downloaders.ARRAY_EXPRESS, downloader_jobs[0].id)])

    batches = Batch.objects.all()
    self.assertEqual(2, len(batches))
    self.assertEqual(1, len(downloader_jobs))

    batch = batches[0]
    self.assertEqual(batch.survey_job.id, self.survey_job.id)
    self.assertEqual(batch.source_type, "ARRAY_EXPRESS")
    self.assertEqual(batch.pipeline_required, "AFFY_TO_PCL")
    self.assertEqual(batch.platform_accession_code, "A-AFFY-1")
    self.assertEqual(batch.experiment_accession_code, "E-MTAB-3050")
    self.assertEqual(batch.experiment_title,
                     "Microarray analysis of in vitro differentiation")
    self.assertEqual(batch.status, "NEW")
    self.assertEqual(batch.release_date, datetime.date(2014, 10, 31))
    self.assertEqual(batch.last_uploaded_date, datetime.date(2014, 10, 30))
    self.assertEqual(batch.organism_id, 9606)
    self.assertEqual(batch.organism_name, "HOMO SAPIENS")

    file = batch.files[0]
    self.assertEqual(file.size_in_bytes, -1)
    self.assertEqual(
        file.download_url,
        "ftp://ftp.ebi.ac.uk/pub/databases/microarray/data/experiment/MTAB/E-MTAB-3050/E-MTAB-3050.raw.1.zip"
    )  # noqa
    self.assertEqual(file.raw_format, "CEL")
    self.assertEqual(file.processed_format, "PCL")
    self.assertEqual(file.name, "C30057.CEL")
    self.assertEqual(file.internal_location, "A-AFFY-1/AFFY_TO_PCL")
def _get_surveyor_for_source(survey_job: SurveyJob):
    """Factory method for ExternalSourceSurveyors."""
    if survey_job.source_type == "ARRAY_EXPRESS":
        return ArrayExpressSurveyor(survey_job)
    elif survey_job.source_type == "SRA":
        return SraSurveyor(survey_job)
    elif survey_job.source_type == "TRANSCRIPTOME_INDEX":
        return TranscriptomeIndexSurveyor(survey_job)
    else:
        raise SourceNotSupportedError(
            "Source " + survey_job.source_type + " is not supported.")
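# A minimal usage sketch (assumed context, not taken from the module above):
# it shows how a caller might dispatch a SurveyJob through the factory. The
# helper name run_survey_job is hypothetical; constructing SurveyJob with a
# source_type and calling survey() with no arguments mirrors the tests above.
def run_survey_job(survey_job: SurveyJob):
    try:
        surveyor = _get_surveyor_for_source(survey_job)
    except SourceNotSupportedError:
        # Unknown source types never reach a surveyor.
        return None
    # survey() discovers samples/experiments and queues DownloaderJobs, as
    # the surveyor tests above exercise with ae_surveyor.survey(); treating
    # its return value as the job result is an assumption of this sketch.
    return surveyor.survey()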
def test_survey(self, mock_send_task):
    """A simple test of the ArrayExpress surveyor."""
    survey_job = self.create_job_for_accession("E-MTAB-3050")
    ae_surveyor = ArrayExpressSurveyor(survey_job)
    ae_surveyor.survey()

    samples = Sample.objects.all()
    downloader_jobs = DownloaderJob.objects.all()

    # We are expecting this to discover 5 samples.
    self.assertEqual(samples.count(), 5)
    # And for one DownloaderJob to be created for all of them.
    self.assertEqual(downloader_jobs.count(), 1)

    experiment = Experiment.objects.first()
    self.assertEqual(experiment.accession_code, "E-MTAB-3050")
    self.assertEqual(experiment.source_first_published,
                     datetime.datetime(2014, 10, 31, tzinfo=timezone.utc))
    self.assertEqual(experiment.source_last_modified,
                     datetime.datetime(2014, 10, 30, tzinfo=timezone.utc))

    sample = Sample.objects.first()
    self.assertTrue(" (hgu95av2)" in sample.pretty_platform)

    # Confirm the sample's protocol_info
    self.assertEqual(len(sample.protocol_info), 9)
    self.assertEqual(sample.protocol_info[0]["Accession"], "P-MTAB-41854")
    self.assertEqual(sample.protocol_info[0]["Text"], "Aliquoting of biomaterials.")
    self.assertEqual(sample.protocol_info[0]["Type"], "split")

    survey_job2 = self.create_job_for_accession("E-GEOD-44719")
    ae_surveyor = ArrayExpressSurveyor(survey_job2)
    ae_surveyor.survey()

    # We are expecting this to discover 77 more samples.
    self.assertEqual(samples.count(), 77 + 5)
    # And for one more DownloaderJob to be created for all of them.
    self.assertEqual(downloader_jobs.count(), 2)
def test_determine_accession(self):
    """Test of the `determine_sample_accession` function."""
    survey_job = self.create_job_for_accession("E-MTAB-3050")
    ae_surveyor = ArrayExpressSurveyor(survey_job)

    EXPERIMENTS_URL = "https://www.ebi.ac.uk/arrayexpress/json/v3/experiments/"
    SAMPLES_URL = EXPERIMENTS_URL + "{}/samples"

    ex_accessions = [
        "E-MTAB-3050",
        "E-MEXP-669",
        "E-MEXP-2215",
        "E-MEXP-2288",
        "E-MEXP-2381",
        "E-MTAB-6739",
    ]

    for ex_accession in ex_accessions:
        samples_endpoint = SAMPLES_URL.format(ex_accession)
        r = requests.get(samples_endpoint, timeout=60)
        samples = r.json()["experiment"]["sample"]

        # An experiment can have many samples.
        for sample in samples:
            # For some reason, this sample has no files associated with it.
            if "file" not in sample or len(sample['file']) == 0:
                continue

            # The accession code is not a simple matter to determine.
            sample_source_name = sample["source"].get("name", "")
            sample_assay_name = sample["assay"].get("name", "")

            has_raw = False
            for sub_file in sample['file']:
                # For example: E-GEOD-15645.
                if isinstance(sub_file['comment'], list):
                    sub_file_mod = sub_file
                    sub_file_mod['comment'] = sub_file['comment'][0]
                else:
                    sub_file_mod = sub_file

                if sub_file_mod['type'] == "data" \
                        and sub_file_mod['comment'].get('value', None) is not None:
                    has_raw = True
                if 'raw' in sub_file_mod['comment'].get('value', ''):
                    has_raw = True

                # Skip derived data if we have it raw.
                if has_raw and "derived data" in sub_file['type']:
                    continue
                elif (not has_raw) and "derived data" not in sub_file['type']:
                    # If there is a platform warning then we don't want raw data.
                    has_raw = False
                    continue

                filename = sub_file["name"]
                sample_accession_code = ae_surveyor.determine_sample_accession(
                    ex_accession, sample_source_name, sample_assay_name, filename)
                self.assertTrue(sample_accession_code is not None)
def handle(self, *args, **options):
    """Refreshes the metadata for all experiments, or experiments from a
    specific database.
    """
    possible_source_databases = ["ARRAY_EXPRESS", "GEO", "SRA"]

    if options.get("source_database", None) is None:
        experiments = Experiment.objects.all()
    elif options["source_database"] in possible_source_databases:
        source_database = options["source_database"]
        experiments = Experiment.objects.filter(source_database=source_database)
    else:
        logger.error('Invalid source database "{}"'.format(options["source_database"])
                     + "\nPossible source databases: {}".format(
                         ", ".join(possible_source_databases)))
        sys.exit(1)

    paginator = PerformantPaginator(experiments, PAGE_SIZE)
    page = paginator.page()

    while True:
        for experiment in page.object_list:
            logger.debug("Refreshing metadata for an experiment.",
                         experiment=experiment.accession_code)
            try:
                if experiment.source_database == "SRA":
                    metadata = SraSurveyor.gather_all_metadata(
                        experiment.samples.first().accession_code)
                    SraSurveyor._apply_metadata_to_experiment(experiment, metadata)

                elif experiment.source_database == "GEO":
                    gse = GEOparse.get_GEO(
                        experiment.accession_code,
                        destdir="/tmp/management",
                        silent=True,
                    )
                    GeoSurveyor._apply_metadata_to_experiment(experiment, gse)

                elif experiment.source_database == "ARRAY_EXPRESS":
                    request_url = EXPERIMENTS_URL + experiment.accession_code
                    experiment_request = utils.requests_retry_session().get(
                        request_url, timeout=60)
                    try:
                        parsed_json = experiment_request.json()["experiments"]["experiment"][0]
                    except KeyError:
                        logger.error(
                            "Remote experiment has no Experiment data!",
                            experiment_accession_code=experiment.accession_code,
                        )
                        continue
                    ArrayExpressSurveyor._apply_metadata_to_experiment(
                        experiment, parsed_json)

                experiment.save()

            # If there are any errors, just continue. It's likely that it's
            # just a problem with this experiment.
            except Exception:
                logger.exception("exception caught while updating metadata for {}".format(
                    experiment.accession_code))

        if not page.has_next():
            break
        else:
            page = paginator.page(page.next_page_number())

        # 2000 experiments queued up every five minutes should be fast
        # enough and also not thrash the DB.
        time.sleep(60 * 5)
def handle(self, *args, **options):
    """Refreshes the metadata for all samples, or samples from a specific
    database.
    """
    possible_source_databases = ["ARRAY_EXPRESS", "GEO", "SRA"]

    if options.get("source_database", None) is None:
        samples = Sample.objects.all()
    elif options["source_database"] in possible_source_databases:
        source_database = options["source_database"]
        samples = Sample.objects.filter(source_database=source_database)
    else:
        logger.error('Invalid source database "{}"'.format(options["source_database"])
                     + "\nPossible source databases: {}".format(
                         ", ".join(possible_source_databases)))
        sys.exit(1)

    paginator = PerformantPaginator(samples, PAGE_SIZE)
    page = paginator.page()

    while True:
        # Iterate over the current page rather than the full queryset so the
        # pagination below actually limits how much work is done per loop.
        for sample in page.object_list:
            logger.debug("Refreshing metadata for a sample.",
                         sample=sample.accession_code)

            if sample.source_database == "SRA":
                metadata = SraSurveyor.gather_all_metadata(sample.accession_code)
                SraSurveyor._apply_harmonized_metadata_to_sample(sample, metadata)

            elif sample.source_database == "GEO":
                gse = GEOparse.get_GEO(
                    sample.experiments.first().accession_code,
                    destdir="/tmp/management",
                    how="brief",
                    silent=True,
                )
                preprocessed_samples = harmony.preprocess_geo(gse.gsms.items())
                harmonized_samples = harmony.harmonize(preprocessed_samples)
                GeoSurveyor._apply_harmonized_metadata_to_sample(
                    sample, harmonized_samples[sample.title])

            elif sample.source_database == "ARRAY_EXPRESS":
                SDRF_URL_TEMPLATE = "https://www.ebi.ac.uk/arrayexpress/files/{code}/{code}.sdrf.txt"
                sdrf_url = SDRF_URL_TEMPLATE.format(
                    code=sample.experiments.first().accession_code)
                sdrf_samples = harmony.parse_sdrf(sdrf_url)
                harmonized_samples = harmony.harmonize(sdrf_samples)
                ArrayExpressSurveyor._apply_harmonized_metadata_to_sample(
                    sample, harmonized_samples[sample.title])

            sample.save()

        if not page.has_next():
            break
        else:
            page = paginator.page(page.next_page_number())

        # 2000 samples queued up every five minutes should be fast
        # enough and also not thrash the DB.
        time.sleep(60 * 5)