def test_geo_experiment_missing_metadata(self):
    """Tests that a GEO experiment has its missing metadata added.

    An experiment is saved with a placeholder title, the management command
    is run, and the stored title is expected to differ from the placeholder
    afterwards. The command is then run a second time to make sure it does
    not fail when there is nothing left to change.
    """
    # 1. Create an experiment with a bad title.
    # The literal is joined by implicit concatenation instead of a backslash
    # continuation inside the quotes, so no stray backslash or indentation
    # ends up in the title.
    BAD_TITLE = (
        "GEO accession GSE1337 is currently private"
        " and is scheduled to be released on Jan 01, 1970."
    )

    experiment = Experiment()
    experiment.accession_code = "GSE11915"
    experiment.source_database = "GEO"
    experiment.title = BAD_TITLE
    experiment.save()

    # 2. Setup is done, actually run the command.
    command = Command()
    command.handle()

    # Test that the title was fixed. get() is used rather than
    # get_or_create(): if the row were missing, get_or_create() would insert
    # a new experiment and the comparison below would pass without the
    # command having fixed anything.
    stored_experiment = Experiment.objects.get(
        accession_code=experiment.accession_code)
    self.assertNotEqual(stored_experiment.title, BAD_TITLE)

    # Run the command again to make sure that it does not fail if there are
    # no changes
    command = Command()
    command.handle()
def test_sra_experiment_missing_alternate_accession(self):
    """Tests that an SRA experiment has its missing alternate_accession_code added.

    Saves an SRA experiment without an alternate accession plus one
    associated sample, runs the management command, and expects the reloaded
    experiment to carry the alternate accession "GSE92260".
    """
    # 1. Create an experiment without an alternate_accession_code
    experiment = Experiment()
    experiment.accession_code = "SRP094947"
    experiment.source_database = "SRA"
    experiment.title = "Not important"
    experiment.save()

    # 2. We need to add a sample because the way that the SRA surveyor finds
    # metadata is through run accessions
    sample = Sample()
    sample.accession_code = "SRR5099111"
    sample.technology = "RNA-SEQ"
    sample.source_database = "SRA"
    sample.title = "Not important"
    sample.save()

    ExperimentSampleAssociation.objects.get_or_create(
        experiment=experiment, sample=sample)

    # 3. Setup is done, actually run the command.
    command = Command()
    command.handle()

    # 4. Refresh the experiment
    experiment.refresh_from_db()

    # Test that the correct alternate_accession_code was added.
    # assertEqual replaces the deprecated assertEquals alias, which recent
    # Python versions no longer provide.
    self.assertEqual(experiment.alternate_accession_code, "GSE92260")
def test_qn_management_command(self):
    """Test that the management command fires off and then does not create a
    job for an organism that does not have enough samples on the same platform.

    Six processed microarray samples are attached to one experiment, then
    ``create_qn_target`` is invoked. The command output must not mention
    "Target file" and no ProcessorJob may exist afterwards.
    """
    homo_sapiens = Organism(name="HOMO_SAPIENS", taxonomy_id=9606)
    homo_sapiens.save()

    experiment = Experiment()
    experiment.accession_code = "12345"
    experiment.save()

    # We don't have a 0.tsv
    codes = ["1", "2", "3", "4", "5", "6"]

    for code in codes:
        sample = Sample()
        sample.accession_code = code
        sample.title = code
        sample.platform_accession_code = "A-MEXP-1171"
        sample.manufacturer = "SLIPPERY DICK'S DISCOUNT MICROARRAYS"
        sample.organism = homo_sapiens
        sample.technology = "MICROARRAY"
        sample.is_processed = True
        sample.save()

        cr = ComputationalResult()
        cr.save()

        computed_file = ComputedFile()
        computed_file.filename = code + ".tsv"
        computed_file.absolute_file_path = "/home/user/data_store/QN/" + code + ".tsv"
        computed_file.size_in_bytes = int(code)
        computed_file.result = cr
        computed_file.is_smashable = True
        computed_file.save()

        scfa = SampleComputedFileAssociation()
        scfa.sample = sample
        scfa.computed_file = computed_file
        scfa.save()

        exsa = ExperimentSampleAssociation()
        exsa.experiment = experiment
        exsa.sample = sample
        exsa.save()

    out = StringIO()
    try:
        call_command("create_qn_target", organism="homo_sapiens", min=1, stdout=out)
    except SystemExit:
        # this is okay! The exception object itself is not needed.
        pass

    stdout = out.getvalue()
    # assertNotIn reports the offending output when it fails, unlike
    # assertFalse on a precomputed boolean.
    self.assertNotIn("Target file", stdout)

    # There's not enough samples available in this scenario so we
    # shouldn't have even made a processor job.
    self.assertEqual(ProcessorJob.objects.count(), 0)
def test_processed_samples_only(self):
    """Don't return unprocessed samples.

    Searches for an experiment that only has an unprocessed sample, first
    without a filter (one hit) and then with ``num_processed_samples__gt``
    set to 0 (no hits). A processed sample is then attached, the cached
    counters on the experiment are updated by hand, and the unfiltered
    search plus the ``processed_samples`` property are checked again.
    """
    experiment = Experiment(accession_code="GSX12345", is_public=True)
    experiment.save()

    unprocessed_sample = Sample(
        title="I am unprocessed",
        accession_code="GSXUnprocessed",
        is_processed=False,
    )
    unprocessed_sample.save()

    unprocessed_link = ExperimentSampleAssociation(
        sample=unprocessed_sample, experiment=experiment)
    unprocessed_link.save()

    search_url = reverse("search", kwargs={"version": API_VERSION})

    # we return all experiments
    unfiltered = self.client.get(search_url, {"search": "GSX12345"})
    self.assertEqual(unfiltered.json()["count"], 1)

    # check requesting only experiments with processed samples
    processed_only = self.client.get(
        search_url,
        {"search": "GSX12345", "num_processed_samples__gt": 0},
    )
    self.assertEqual(processed_only.json()["count"], 0)

    processed_sample = Sample(
        title="I am processed",
        accession_code="GSXProcessed",
        is_processed=True,
    )
    processed_sample.save()

    processed_link = ExperimentSampleAssociation(
        sample=processed_sample, experiment=experiment)
    processed_link.save()

    # update cached values
    experiment.num_total_samples = 2
    experiment.num_processed_samples = 1
    experiment.save()

    after_update = self.client.get(
        reverse("search", kwargs={"version": API_VERSION}),
        {"search": "GSX12345"},
    )
    self.assertEqual(after_update.json()["count"], 1)

    self.assertEqual(len(experiment.processed_samples), 1)

    # Explicit cleanup, in the same order as before.
    experiment.delete()
    unprocessed_sample.delete()
    processed_sample.delete()
def make_test_data(organism):
    """Create one RNA-SEQ experiment with a single processed sample.

    The experiment "GSE51088" is linked to ``organism`` and to sample
    "GSM1237818". Two computed files are saved against the same
    computational result: "quant.sf" (with S3 location and sha1) and
    "logquant.tsv". Only "logquant.tsv" is associated with the sample.

    Args:
        organism: organism object assigned to both the experiment
            association and the sample.
    """
    experiment = Experiment(accession_code="GSE51088", technology="RNA-SEQ")
    experiment.save()

    experiment_organism_link = ExperimentOrganismAssociation(
        experiment=experiment, organism=organism)
    experiment_organism_link.save()

    result = ComputationalResult()
    result.save()

    sample = Sample(
        accession_code="GSM1237818",
        title="GSM1237818",
        organism=organism,
        technology="RNA-SEQ",
        is_processed=True,
    )
    sample.save()

    sample_result_link = SampleResultAssociation(sample=sample, result=result)
    sample_result_link.save()

    experiment_sample_link = ExperimentSampleAssociation(
        experiment=experiment, sample=sample)
    experiment_sample_link.save()

    quant_file = ComputedFile(
        s3_key="smasher-test-quant.sf",
        s3_bucket="data-refinery-test-assets",
        filename="quant.sf",
        absolute_file_path="/home/user/data_store/QUANT/smasher-test-quant.sf",
        result=result,
        is_smashable=True,
        size_in_bytes=123123,
        # this matches with the downloaded file
        sha1="08c7ea90b66b52f7cd9d9a569717a1f5f3874967",
    )
    quant_file.save()

    logquant_file = ComputedFile(
        filename="logquant.tsv",
        is_smashable=True,
        size_in_bytes=123123,
        result=result,
    )
    logquant_file.save()

    # Only the second file (logquant.tsv) is tied to the sample.
    sample_file_link = SampleComputedFileAssociation(
        sample=sample, computed_file=logquant_file)
    sample_file_link.save()
def test_qn_reference(self, mock_send_job):
    """Run the command for two organisms and expect one job per organism.

    400 processed microarray samples are created under a single experiment:
    codes 1-200 belong to HOMO_SAPIENS and codes 201-400 to MUS_MUSCULUS.
    After running the command for both organisms, ``mock_send_job`` must
    have recorded two calls and two ProcessorJob rows must exist.
    """
    organism = Organism(name="HOMO_SAPIENS", taxonomy_id=9606)
    organism.save()

    experiment = Experiment(accession_code="12345")
    experiment.save()

    for index in range(1, 401):
        code = str(index)

        sample = Sample(
            accession_code=code,
            title=code,
            platform_name=f"Affymetrix {organism.name}",
            platform_accession_code=f"A-MEXP-{organism.name}",
            manufacturer="AFFYMETRIX",
            organism=organism,
            technology="MICROARRAY",
            is_processed=True,
            has_raw=True,
        )
        sample.save()

        computational_result = ComputationalResult()
        computational_result.save()

        tsv_name = code + ".tsv"
        computed_file = ComputedFile(
            filename=tsv_name,
            absolute_file_path="/home/user/data_store/QN/" + tsv_name,
            size_in_bytes=index,
            result=computational_result,
            is_smashable=True,
        )
        computed_file.save()

        sample_file_link = SampleComputedFileAssociation(
            sample=sample, computed_file=computed_file)
        sample_file_link.save()

        experiment_sample_link = ExperimentSampleAssociation(
            experiment=experiment, sample=sample)
        experiment_sample_link.save()

        # We need more than one organism for the tests, but can't
        # repeat accession codes, so halfway through just change the organism.
        if index == 200:
            organism = Organism(name="MUS_MUSCULUS", taxonomy_id=111)
            organism.save()

    # Setup is done, actually run the command.
    Command().handle(organisms="HOMO_SAPIENS,MUS_MUSCULUS")

    self.assertEqual(len(mock_send_job.mock_calls), 2)
    self.assertEqual(ProcessorJob.objects.count(), 2)
def setUp(self):
    """Create the fixtures shared by the tests in this class.

    Saves one RNA-SEQ experiment with two SRA samples attached, two ontology
    terms ("length" and "thou"), and one contribution. The experiment and
    the contribution are kept on ``self`` for the tests to use.
    """
    self.experiment = Experiment(
        accession_code="GSE000",
        alternate_accession_code="E-GEOD-000",
        title="NONONONO",
        description="Boooooourns. Wasabi.",
        technology="RNA-SEQ",
    )
    self.experiment.save()

    # Create some samples to attach keywords to
    for run_accession in ("SRR123", "SRR456"):
        sample = Sample(
            accession_code=run_accession,
            technology="RNA-SEQ",
            source_database="SRA",
            title="Not important",
        )
        sample.save()

        link = ExperimentSampleAssociation(
            sample=sample, experiment=self.experiment)
        link.save()

    # Create the ontology terms I'm using in the tests
    ontology_terms = (
        ("PATO:0000122", "length"),
        ("UO:0010012", "thou"),
    )
    for term_id, readable_name in ontology_terms:
        term = OntologyTerm(
            ontology_term=term_id, human_readable_name=readable_name)
        term.save()

    self.contribution = Contribution(
        source_name="refinebio_tests", methods_url="ccdatalab.org")
    self.contribution.save()
def prepare_experiment(ids: List[int]) -> Experiment:
    """Create an experiment with one processed microarray sample per id.

    Each id is turned into a sample accession code. Every sample gets its
    own computational result and a smashable computed file
    ``/home/user/data_store/QN/<id>.tsv`` whose ``size_in_bytes`` is the id.

    Args:
        ids: integers used as sample accession codes and file names.

    Returns:
        The saved experiment (accession code "12345") that all created
        samples are associated with.
    """
    (homo_sapiens, _) = Organism.objects.get_or_create(name="HOMO_SAPIENS", taxonomy_id=9606)

    experiment = Experiment()
    experiment.accession_code = "12345"
    experiment.save()

    codes = [str(i) for i in ids]

    for code in codes:
        sample = Sample()
        sample.accession_code = code
        sample.title = code
        sample.platform_accession_code = "A-MEXP-1171"
        sample.manufacturer = "SLIPPERY DICK'S DISCOUNT MICROARRAYS"
        sample.organism = homo_sapiens
        sample.technology = "MICROARRAY"
        sample.is_processed = True
        sample.save()

        cr = ComputationalResult()
        cr.save()

        computed_file = ComputedFile()
        computed_file.filename = code + ".tsv"
        computed_file.absolute_file_path = "/home/user/data_store/QN/" + code + ".tsv"
        computed_file.size_in_bytes = int(code)
        computed_file.result = cr
        computed_file.is_smashable = True
        computed_file.save()

        scfa = SampleComputedFileAssociation()
        scfa.sample = sample
        scfa.computed_file = computed_file
        scfa.save()

        exsa = ExperimentSampleAssociation()
        exsa.experiment = experiment
        exsa.sample = sample
        exsa.save()

    # The signature promises an Experiment; previously the function fell off
    # the end and returned None.
    return experiment
def test_sra_experiment_missing_metadata(self):
    """Tests that an SRA experiment has its missing metadata added.

    An SRA experiment is saved with a placeholder title together with one
    associated sample, the management command is run, and the stored title
    is expected to differ from the placeholder afterwards. The command is
    then run a second time to make sure it does not fail when there is
    nothing left to change.
    """
    # 1. Create an experiment with a bad title.
    # The literal is joined by implicit concatenation instead of a backslash
    # continuation inside the quotes, so no stray backslash or indentation
    # ends up in the title.
    BAD_TITLE = (
        "GEO accession GSE1337 is currently private"
        " and is scheduled to be released on Jan 01, 1970."
    )

    experiment = Experiment()
    experiment.accession_code = "DRP003977"
    experiment.source_database = "SRA"
    experiment.title = BAD_TITLE
    experiment.save()

    # 2. We need to add a sample because the way that the SRA surveyor finds
    # metadata is through run accessions
    sample = Sample()
    sample.accession_code = "DRR002116"
    sample.technology = "RNA-SEQ"
    sample.source_database = "SRA"
    sample.title = "Not important"
    sample.save()

    ExperimentSampleAssociation.objects.get_or_create(
        experiment=experiment, sample=sample)

    # 3. Setup is done, actually run the command.
    command = Command()
    command.handle()

    # Test that the title was fixed. get() is used rather than
    # get_or_create(): if the row were missing, get_or_create() would insert
    # a new experiment and the comparison below would pass without the
    # command having fixed anything.
    stored_experiment = Experiment.objects.get(
        accession_code=experiment.accession_code)
    self.assertNotEqual(stored_experiment.title, BAD_TITLE)

    # Run the command again to make sure that it does not fail if there are
    # no changes
    command = Command()
    command.handle()
def test_no_repeat_jobs(self):
    """Make sure no second DownloaderJob is queued for files that have one.

    One sample gets two original files, and both files are attached to a
    single pre-existing DownloaderJob. Asking the surveyor to queue a
    downloader job for those same files must leave the DownloaderJob count
    at one.
    """
    experiment_object = Experiment()
    experiment_object.accession_code = "Experiment1"
    experiment_object.save()

    sample_object = Sample()
    sample_object.accession_code = "Sample1"
    sample_object.platform_accession_code = "Illumina Genome Analyzer"
    # NOTE(review): set as a plain attribute exactly as before; another test
    # in this file uses platform_name -- confirm this is a real Sample field.
    sample_object.platform_accession_name = "Illumina Genome Analyzer"
    sample_object.technology = "RNA-SEQ"
    sample_object.manufacturer = "ILLUMINA"
    sample_object.source_database = "SRA"
    sample_object.save()

    # Two not-yet-downloaded raw files, each tied to the one sample.
    original_files = []
    for source_url, source_filename in (
        ("first_url", "first_filename"),
        ("second_url", "second_filename"),
    ):
        original_file = OriginalFile()
        original_file.source_url = source_url
        original_file.source_filename = source_filename
        original_file.is_downloaded = False
        original_file.has_raw = True
        original_file.save()

        file_sample_link = OriginalFileSampleAssociation()
        file_sample_link.original_file = original_file
        file_sample_link.sample = sample_object
        file_sample_link.save()

        original_files.append(original_file)

    # A downloader job that already covers both files.
    existing_job = DownloaderJob()
    existing_job.save()
    for original_file in original_files:
        DownloaderJobOriginalFileAssociation(
            downloader_job=existing_job, original_file=original_file
        ).save()

    survey_job = SurveyJob(source_type="SRA")
    survey_job.save()
    surveyor = SraSurveyor(survey_job)

    surveyor.queue_downloader_job_for_original_files(
        original_files, experiment_object.accession_code
    )

    # We made one DownloaderJob in this test, so
    # queue_downloader_job_for_original_files didn't have anything
    # to do, so there should still be only one:
    self.assertEqual(1, DownloaderJob.objects.all().count())
def test_queue_downloader_jobs_for_original_files(self, mock_send_task):
    """Make sure that queue_downloader_jobs queues all expected Downloader
    jobs for a given experiment.

    Two samples are attached to one experiment and each sample gets two
    original files. The surveyor is asked to queue a downloader job once per
    sample's file list, and exactly two DownloaderJob rows are expected.

    ``mock_send_task`` is not used in the body; it is presumably injected by
    a patch decorator that is not visible here -- confirm against the
    decorator.
    """
    # First, create an experiment with two samples associated with it
    # and create two original files for each of those samples.
    experiment_object = Experiment()
    experiment_object.accession_code = "Experiment1"
    experiment_object.save()

    sample_objects = []
    for accession_code in ("Sample1", "Sample2"):
        sample_object = Sample()
        sample_object.accession_code = accession_code
        sample_object.platform_accession_code = "Illumina Genome Analyzer"
        # NOTE(review): kept as a plain attribute assignment; another test in
        # this file uses platform_name -- confirm this is a real Sample field.
        sample_object.platform_accession_name = "Illumina Genome Analyzer"
        sample_object.technology = "RNA-SEQ"
        sample_object.manufacturer = "ILLUMINA"
        sample_object.source_database = "SRA"
        sample_object.save()
        sample_objects.append(sample_object)
    sample_object_1, sample_object_2 = sample_objects

    for sample_object in sample_objects:
        association = ExperimentSampleAssociation()
        association.experiment = experiment_object
        association.sample = sample_object
        association.save()

    sample_1_original_files = []
    sample_2_original_files = []

    # (source_url, source_filename, owning sample, list the file belongs in).
    # The second file used to be appended to sample 2's list although it is
    # associated with sample 1; each file now lands in its own sample's list.
    file_plan = (
        ("first_url", "first_filename", sample_object_1, sample_1_original_files),
        ("second_url", "second_filename", sample_object_1, sample_1_original_files),
        ("third_url", "third_filename", sample_object_2, sample_2_original_files),
        ("fourth_url", "fourth_filename", sample_object_2, sample_2_original_files),
    )
    for source_url, source_filename, owner, owner_files in file_plan:
        original_file = OriginalFile()
        original_file.source_url = source_url
        original_file.source_filename = source_filename
        original_file.is_downloaded = False
        original_file.has_raw = True
        original_file.save()
        owner_files.append(original_file)

        original_file_sample_association = OriginalFileSampleAssociation()
        original_file_sample_association.original_file = original_file
        original_file_sample_association.sample = owner
        original_file_sample_association.save()

    survey_job = SurveyJob(source_type="SRA")
    survey_job.save()
    surveyor = SraSurveyor(survey_job)

    surveyor.queue_downloader_job_for_original_files(
        sample_1_original_files, experiment_object.accession_code
    )
    surveyor.queue_downloader_job_for_original_files(
        sample_2_original_files, experiment_object.accession_code
    )

    self.assertEqual(DownloaderJob.objects.all().count(), 2)
def test_dualtech_smash(self):
    """Smash a dataset mixing one MICROARRAY and one RNA-SEQ sample.

    The same SMASHER job is run three times against one dataset, changing
    ``aggregate_by`` each time (SPECIES, then EXPERIMENT, then ALL). After
    every run the output file must exist and the final frame must have the
    expected number of columns (2, 1 and 2 respectively). Each output file
    is removed once it has been checked.
    """
    pj = ProcessorJob()
    pj.pipeline_applied = "SMASHER"
    pj.save()

    # MICROARRAY TECH
    experiment = Experiment()
    experiment.accession_code = "GSE1487313"
    experiment.save()

    result = ComputationalResult()
    result.save()

    gallus_gallus = Organism.get_object_for_name("GALLUS_GALLUS")

    sample = Sample()
    sample.accession_code = "GSM1487313"
    sample.title = "GSM1487313"
    sample.organism = gallus_gallus
    sample.technology = "MICROARRAY"
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1487313_liver.PCL"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    # RNASEQ TECH
    experiment2 = Experiment()
    experiment2.accession_code = "SRS332914"
    experiment2.save()

    result2 = ComputationalResult()
    result2.save()

    sample2 = Sample()
    sample2.accession_code = "SRS332914"
    sample2.title = "SRS332914"
    sample2.organism = gallus_gallus
    sample2.technology = "RNA-SEQ"
    sample2.save()

    sra2 = SampleResultAssociation()
    sra2.sample = sample2
    sra2.result = result2
    sra2.save()

    esa2 = ExperimentSampleAssociation()
    esa2.experiment = experiment2
    esa2.sample = sample2
    esa2.save()

    computed_file2 = ComputedFile()
    computed_file2.filename = "SRP149598_gene_lengthScaledTPM.tsv"
    computed_file2.absolute_file_path = "/home/user/data_store/PCL/" + computed_file2.filename
    computed_file2.result = result2
    computed_file2.size_in_bytes = 234
    computed_file2.is_smashable = True
    computed_file2.save()

    assoc2 = SampleComputedFileAssociation()
    assoc2.sample = sample2
    assoc2.computed_file = computed_file2
    assoc2.save()

    # CROSS-SMASH BY SPECIES
    # NOTE(review): the second key is "SRX332914" while experiment2 was saved
    # as "SRS332914"; left untouched -- confirm whether that is intentional.
    ds = Dataset()
    ds.data = {"GSE1487313": ["GSM1487313"], "SRX332914": ["SRS332914"]}
    ds.aggregate_by = "SPECIES"
    ds.scale_by = "STANDARD"
    ds.email_address = "*****@*****.**"
    ds.quantile_normalize = False
    ds.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = pj
    pjda.dataset = ds
    pjda.save()

    self.assertTrue(ds.is_cross_technology())
    final_context = smasher.smash(pj.pk, upload=False)
    self.assertTrue(os.path.exists(final_context["output_file"]))
    os.remove(final_context["output_file"])
    self.assertEqual(len(final_context["final_frame"].columns), 2)

    # THEN BY EXPERIMENT, THEN BY ALL
    for aggregate_by, expected_columns in (("EXPERIMENT", 1), ("ALL", 2)):
        ds.aggregate_by = aggregate_by
        ds.save()

        # Reload the dataset from the database after changing it.
        dsid = ds.id
        ds = Dataset.objects.get(id=dsid)

        # Clear the job's timestamps before smashing again (presumably so
        # the job can be re-run -- confirm against the smasher).
        pj.start_time = None
        pj.end_time = None
        pj.save()

        final_context = smasher.smash(pj.pk, upload=False)
        self.assertTrue(os.path.exists(final_context["output_file"]))
        # The last run used to leave its output file behind; every run now
        # cleans up after itself.
        os.remove(final_context["output_file"])
        self.assertEqual(len(final_context["final_frame"].columns), expected_columns)
def create_experiment_and_samples_from_api(
        self, experiment_accession_code) -> (Experiment, List[Sample]):
    """The main surveyor - find the Experiment and Samples from NCBI GEO.

    Uses the GEOParse library, for which docs can be found here:
    https://geoparse.readthedocs.io/en/latest/usage.html#working-with-geo-objects

    The GEO series is fetched into this surveyor's temp path. The matching
    Experiment is looked up by accession code and created (with an
    ExperimentAnnotation holding the series metadata) only if it does not
    exist yet. Each sample in the series is then either associated with the
    experiment (if it already exists) or created along with its annotation,
    its supplementary OriginalFiles and the corresponding associations.
    Finally the experiment-level supplementary files and the MINiML family
    file are registered and linked to the created samples where applicable.

    Args:
        experiment_accession_code: accession code passed to GEOparse and
            used to look up / create the Experiment.

    Returns:
        A tuple ``(experiment_object, created_samples)``. ``created_samples``
        only contains samples that were newly created by this call and
        whose technology is not "RNA-SEQ"; samples that already existed are
        associated with the experiment but not returned.
    """
    # Cleaning up is tracked here: https://github.com/guma44/GEOparse/issues/41
    gse = GEOparse.get_GEO(experiment_accession_code,
                           destdir=self.get_temp_path(),
                           how="brief",
                           silent=True)
    preprocessed_samples = harmony.preprocess_geo(gse.gsms.items())
    harmonized_samples = harmony.harmonize(preprocessed_samples)

    # Create the experiment object
    try:
        experiment_object = Experiment.objects.get(
            accession_code=experiment_accession_code)
        logger.debug(
            "Experiment %s already exists, skipping object creation.",
            experiment_accession_code,
            survey_job=self.survey_job.id,
        )
    except Experiment.DoesNotExist:
        experiment_object = Experiment()
        experiment_object.accession_code = experiment_accession_code

        GeoSurveyor._apply_metadata_to_experiment(experiment_object, gse)

        experiment_object.save()

        # Keep the raw series metadata alongside the new experiment.
        experiment_annotation = ExperimentAnnotation()
        experiment_annotation.data = gse.metadata
        experiment_annotation.experiment = experiment_object
        experiment_annotation.is_ccdl = False
        experiment_annotation.save()

    # Okay, here's the situation!
    # Sometimes, samples have a direct single representation for themselves.
    # Other times, there is a single file with references to every sample in it.
    created_samples = []
    for sample_accession_code, sample in gse.gsms.items():

        try:
            sample_object = Sample.objects.get(
                accession_code=sample_accession_code)
            logger.debug(
                "Sample %s from experiment %s already exists, skipping object creation.",
                sample_accession_code,
                experiment_object.accession_code,
                survey_job=self.survey_job.id,
            )

            # Associate it with the experiment, but since it
            # already exists it already has original files
            # associated with it and it's already been downloaded,
            # so don't add it to created_samples.
            ExperimentSampleAssociation.objects.get_or_create(
                experiment=experiment_object, sample=sample_object)

            ExperimentOrganismAssociation.objects.get_or_create(
                experiment=experiment_object,
                organism=sample_object.organism)
        except Sample.DoesNotExist:
            # The organism name is taken from the first "organism_ch1" entry
            # of the sample metadata, upper-cased.
            organism = Organism.get_object_for_name(
                sample.metadata["organism_ch1"][0].upper())

            sample_object = Sample()
            sample_object.source_database = "GEO"
            sample_object.accession_code = sample_accession_code
            sample_object.organism = organism

            # If data processing step, it isn't raw.
            sample_object.has_raw = not sample.metadata.get(
                "data_processing", None)

            ExperimentOrganismAssociation.objects.get_or_create(
                experiment=experiment_object, organism=organism)
            sample_object.title = sample.metadata["title"][0]

            self.set_platform_properties(sample_object, sample.metadata,
                                         gse)

            # Harmonized metadata is looked up by the sample's title.
            GeoSurveyor._apply_harmonized_metadata_to_sample(
                sample_object, harmonized_samples[sample_object.title])

            # Sample-level protocol_info
            sample_object.protocol_info = self.get_sample_protocol_info(
                sample.metadata, sample_accession_code)

            sample_object.save()
            logger.debug("Created Sample: " + str(sample_object))

            sample_annotation = SampleAnnotation()
            sample_annotation.sample = sample_object
            sample_annotation.data = sample.metadata
            sample_annotation.is_ccdl = False
            sample_annotation.save()

            sample_supplements = sample.metadata.get(
                "supplementary_file", [])
            for supplementary_file_url in sample_supplements:

                # Why do they give us this?
                # A "NONE" entry stops processing of this sample's remaining
                # supplements (break, not continue).
                if supplementary_file_url == "NONE":
                    break

                # We never want these!
                if "idat.gz" in supplementary_file_url.lower():
                    continue
                if "chp.gz" in supplementary_file_url.lower():
                    continue
                if "ndf.gz" in supplementary_file_url.lower():
                    continue
                if "pos.gz" in supplementary_file_url.lower():
                    continue
                if "pair.gz" in supplementary_file_url.lower():
                    continue
                if "gff.gz" in supplementary_file_url.lower():
                    continue

                # Sometimes, we are lied to about the data processing step.
                # A .cel or "non-normalized" file name marks the sample as
                # raw regardless of the metadata checked above.
                lower_file_url = supplementary_file_url.lower()
                if (".cel" in lower_file_url
                        or ("_non_normalized.txt" in lower_file_url)
                        or ("_non-normalized.txt" in lower_file_url)
                        or ("-non-normalized.txt" in lower_file_url)
                        or ("-non_normalized.txt" in lower_file_url)):
                    sample_object.has_raw = True
                    sample_object.save()

                # filename and source_filename are the same for these
                filename = FileUtils.get_filename(supplementary_file_url)
                original_file = OriginalFile.objects.get_or_create(
                    source_url=supplementary_file_url,
                    filename=filename,
                    source_filename=filename,
                    has_raw=sample_object.has_raw,
                    is_archive=FileUtils.is_archive(filename),
                )[0]

                logger.debug("Created OriginalFile: " + str(original_file))

                # The (object, created) tuple returned here is not used
                # afterwards.
                original_file_sample_association = OriginalFileSampleAssociation.objects.get_or_create(
                    original_file=original_file, sample=sample_object)

                if original_file.is_affy_data():
                    # Only Affymetrix Microarrays produce .CEL files
                    sample_object.technology = "MICROARRAY"
                    sample_object.manufacturer = "AFFYMETRIX"
                    sample_object.save()

            # It's okay to survey RNA-Seq samples from GEO, but we
            # don't actually want to download/process any RNA-Seq
            # data unless it comes from SRA.
            if sample_object.technology != "RNA-SEQ":
                created_samples.append(sample_object)

            # Now that we've determined the technology at the
            # sample level, we can set it at the experiment level,
            # just gotta make sure to only do it once. There can
            # be more than one technology, this should be changed
            # as part of:
            # https://github.com/AlexsLemonade/refinebio/issues/1099
            if not experiment_object.technology:
                experiment_object.technology = sample_object.technology
                experiment_object.save()

            ExperimentSampleAssociation.objects.get_or_create(
                experiment=experiment_object, sample=sample_object)

    # These supplementary files _may-or-may-not_ contain the type of raw data we can process.
    for experiment_supplement_url in gse.metadata.get(
            "supplementary_file", []):

        # filename and source_filename are the same for these
        filename = experiment_supplement_url.split("/")[-1]
        # NOTE(review): sample_object here is whatever the loops above last
        # bound it to (the inner loop below rebinds it as well). If gse.gsms
        # is empty it looks like it would be unbound at this point -- confirm.
        original_file = OriginalFile.objects.get_or_create(
            source_url=experiment_supplement_url,
            filename=filename,
            source_filename=filename,
            has_raw=sample_object.has_raw,
            is_archive=True,
        )[0]

        logger.debug("Created OriginalFile: " + str(original_file))

        # A "non-normalized" experiment-level file marks every created
        # sample as raw and links the file to each of them.
        lower_supplement_url = experiment_supplement_url.lower()
        if (("_non_normalized.txt" in lower_supplement_url)
                or ("_non-normalized.txt" in lower_supplement_url)
                or ("-non-normalized.txt" in lower_supplement_url)
                or ("-non_normalized.txt" in lower_supplement_url)):
            for sample_object in created_samples:
                sample_object.has_raw = True
                sample_object.save()

                OriginalFileSampleAssociation.objects.get_or_create(
                    sample=sample_object, original_file=original_file)

        # Delete this Original file if it isn't being used.
        if (OriginalFileSampleAssociation.objects.filter(
                original_file=original_file).count() == 0):
            original_file.delete()

    # These are the Miniml/Soft/Matrix URLs that are always(?) provided.
    # GEO describes different types of data formatting as "families"
    family_url = self.get_miniml_url(experiment_accession_code)
    # NOTE(review): same reuse of the last-bound sample_object as above.
    miniml_original_file = OriginalFile.objects.get_or_create(
        source_url=family_url,
        source_filename=family_url.split("/")[-1],
        has_raw=sample_object.has_raw,
        is_archive=True,
    )[0]
    for sample_object in created_samples:
        # We don't need a .txt if we have a .CEL
        if sample_object.has_raw:
            continue
        OriginalFileSampleAssociation.objects.get_or_create(
            sample=sample_object, original_file=miniml_original_file)

    # Delete this Original file if it isn't being used.
    if (OriginalFileSampleAssociation.objects.filter(
            original_file=miniml_original_file).count() == 0):
        miniml_original_file.delete()

    # Trash the temp path
    try:
        shutil.rmtree(self.get_temp_path())
    except Exception:
        # There was a problem during surveying so this didn't get created.
        # It's not a big deal.
        pass

    return experiment_object, created_samples
def test_create_compendia_danio(self):
    """Create a DANIO_RERIO compendium from microarray and RNA-seq test files.

    Every file in the MICROARRAY and RNASEQ test directories (except the
    ``microarray.txt`` / ``rnaseq.txt`` entries) becomes a sample with a
    smashable computed file. One additional sample has no computed file and
    is expected to show up in ``filtered_samples``. The resulting compendium
    must have one computed file on disk, use the ARPACK SVD algorithm and
    have DANIO_RERIO as its only organism.
    """

    def _register_directory(directory, skip_marker, technology, organism,
                            experiment, result):
        """Save a sample and computed file per directory entry; return the names."""
        registered = []
        for entry in os.listdir(directory):
            if skip_marker in entry:
                continue

            sample = Sample(
                accession_code=entry,
                title=entry,
                organism=organism,
                technology=technology,
            )
            sample.save()

            sample_result_link = SampleResultAssociation(
                sample=sample, result=result)
            sample_result_link.save()

            experiment_sample_link = ExperimentSampleAssociation(
                experiment=experiment, sample=sample)
            experiment_sample_link.save()

            computed_file = ComputedFile(
                filename=entry,
                absolute_file_path=directory + entry,
                result=result,
                size_in_bytes=123,
                is_smashable=True,
            )
            computed_file.save()

            sample_file_link = SampleComputedFileAssociation(
                sample=sample, computed_file=computed_file)
            sample_file_link.save()

            registered.append(entry)
        return registered

    job = ProcessorJob(
        pipeline_applied=ProcessorPipeline.CREATE_COMPENDIA.value)
    job.save()

    # MICROARRAY TECH
    microarray_experiment = Experiment(accession_code="GSE1234")
    microarray_experiment.save()

    qn_result = ComputationalResult()
    qn_result.save()

    qn_target = ComputedFile(
        filename="danio_target.tsv",
        absolute_file_path="/home/user/data_store/QN/danio_target.tsv",
        is_qn_target=True,
        size_in_bytes="12345",
        sha1="aabbccddeeff",
        result=qn_result,
    )
    qn_target.save()

    danio_rerio = Organism(name="DANIO_RERIO", taxonomy_id=1, qn_target=qn_result)
    danio_rerio.save()

    qn_annotation = ComputationalResultAnnotation(
        data={"organism_id": danio_rerio.id, "is_qn": True},
        result=qn_result,
    )
    qn_annotation.save()

    microarray_result = ComputationalResult()
    microarray_result.save()

    micros = _register_directory(
        "/home/user/data_store/raw/TEST/MICROARRAY/",
        "microarray.txt",
        "MICROARRAY",
        danio_rerio,
        microarray_experiment,
        microarray_result,
    )

    rnaseq_experiment = Experiment(accession_code="GSE5678")
    rnaseq_experiment.save()

    rnaseq_result = ComputationalResult()
    rnaseq_result.save()

    rnas = _register_directory(
        "/home/user/data_store/raw/TEST/RNASEQ/",
        "rnaseq.txt",
        "RNASEQ",
        danio_rerio,
        rnaseq_experiment,
        rnaseq_result,
    )

    # Missing sample that will be filtered
    missing_sample = Sample(
        accession_code="GSM1487222",
        title="this sample will be filtered",
        organism=danio_rerio,
        technology="RNASEQ",
    )
    missing_sample.save()

    missing_result_link = SampleResultAssociation(
        sample=missing_sample, result=rnaseq_result)
    missing_result_link.save()

    missing_experiment_link = ExperimentSampleAssociation(
        experiment=rnaseq_experiment, sample=missing_sample)
    missing_experiment_link.save()

    rnas.append(missing_sample.accession_code)

    dset = Dataset(
        data={"GSE1234": micros, "GSE5678": rnas},
        scale_by="NONE",
        aggregate_by="SPECIES",
        svd_algorithm="ARPACK",
        quantile_normalize=False,
    )
    dset.save()

    job_dataset_link = ProcessorJobDatasetAssociation(
        processor_job=job, dataset=dset)
    job_dataset_link.save()

    final_context = create_compendia.create_compendia(job.id)
    compendium = final_context["compendium_result"]

    # Verify result
    self.assertEqual(compendium.result.computedfile_set.count(), 1)
    for produced_file in compendium.result.computedfile_set.all():
        self.assertTrue(os.path.exists(produced_file.absolute_file_path))

    # test compendium_result
    self.assertEqual(compendium.svd_algorithm, "ARPACK")
    self.assertEqual(compendium.primary_organism.name,
                     final_context["organism_name"])
    self.assertEqual(compendium.primary_organism.name, "DANIO_RERIO")
    self.assertEqual(compendium.organisms.count(), 1)

    # check that sample with no computed file was skipped
    filtered_samples = final_context["filtered_samples"]
    self.assertIn("GSM1487222", filtered_samples)
    self.assertEqual(
        filtered_samples["GSM1487222"]["experiment_accession_code"],
        "GSE5678")
def test_log2(self):
    """Smash two GSE44421 samples and expect the smasher to report success.

    The experiment was chosen because it has non-log2 data:
    https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE44421
    ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE44nnn/GSE44421/miniml/GSE44421_family.xml.tgz
    """
    smasher_job = ProcessorJob()
    smasher_job.pipeline_applied = "SMASHER"
    smasher_job.save()

    geo_experiment = Experiment()
    geo_experiment.accession_code = "GSE44421"
    geo_experiment.save()

    shared_result = ComputationalResult()
    shared_result.save()

    human = Organism.get_object_for_name("HOMO_SAPIENS")

    sample_accessions = ['GSM1084806', 'GSM1084807']
    for accession in sample_accessions:
        # Every sample gets, in this order: its own row, a link to the
        # shared result, a link to the experiment, one smashable computed
        # file, and a link from the sample to that file.
        gsm = Sample()
        gsm.accession_code = accession
        gsm.title = accession
        gsm.organism = human
        gsm.save()

        result_link = SampleResultAssociation()
        result_link.sample = gsm
        result_link.result = shared_result
        result_link.save()

        experiment_link = ExperimentSampleAssociation()
        experiment_link.experiment = geo_experiment
        experiment_link.sample = gsm
        experiment_link.save()

        table_file = ComputedFile()
        table_file.filename = accession + "-tbl-1.txt"
        table_file.absolute_file_path = "/home/user/data_store/PCL/" + table_file.filename
        table_file.result = shared_result
        table_file.size_in_bytes = 123
        table_file.is_smashable = True
        table_file.save()

        file_link = SampleComputedFileAssociation()
        file_link.sample = gsm
        file_link.computed_file = table_file
        file_link.save()

    dataset = Dataset()
    dataset.data = {'GSE44421': sample_accessions}
    dataset.aggregate_by = 'EXPERIMENT'
    dataset.scale_by = 'MINMAX'
    dataset.email_address = "*****@*****.**"
    dataset.quantile_normalize = False
    dataset.save()

    job_dataset_link = ProcessorJobDatasetAssociation()
    job_dataset_link.processor_job = smasher_job
    job_dataset_link.dataset = dataset
    job_dataset_link.save()

    final_context = smasher.smash(smasher_job.pk, upload=False)

    # Reload the dataset row after the smash; the lookup itself fails if
    # the row no longer exists.
    dataset = Dataset.objects.get(id=dataset.id)

    self.assertTrue(final_context['success'])
def test_no_smash_all_diff_species(self):
    """Check that an 'ALL' aggregation marks the unsmashable file.

    Smashing together with 'ALL' with different species is a really weird
    behavior. This test isn't really testing a normal case, it just makes
    sure that the unsmashable files get marked.
    """

    def persist(record, **fields):
        # Assign the given attributes, save the record and hand it back.
        for attribute, value in fields.items():
            setattr(record, attribute, value)
        record.save()
        return record

    smasher_job = persist(ProcessorJob(), pipeline_applied="SMASHER")

    # First experiment: one sample with one smashable PCL file.
    first_experiment = persist(Experiment(), accession_code="GSE51081")
    first_result = persist(ComputationalResult())
    human = Organism.get_object_for_name("HOMO_SAPIENS")

    first_sample = persist(Sample(),
                           accession_code='GSM1237810',
                           title='GSM1237810',
                           organism=human)
    persist(SampleResultAssociation(),
            sample=first_sample,
            result=first_result)
    persist(ExperimentSampleAssociation(),
            experiment=first_experiment,
            sample=first_sample)

    first_filename = "GSM1237810_T09-1084.PCL"
    first_file = persist(
        ComputedFile(),
        filename=first_filename,
        absolute_file_path="/home/user/data_store/PCL/" + first_filename,
        result=first_result,
        size_in_bytes=123,
        is_smashable=True)

    # The second result row is created before the first sample is linked
    # to its file; everything below uses this second result.
    second_result = persist(ComputationalResult())

    persist(SampleComputedFileAssociation(),
            sample=first_sample,
            computed_file=first_file)

    # Second experiment: one sample with one smashable table file.
    second_experiment = persist(Experiment(), accession_code="GSE51084")

    # NOTE(review): the mouse organism is looked up but never assigned;
    # the second sample below is also HOMO_SAPIENS. Confirm whether the
    # "different species" in the test name is still intended.
    Organism.get_object_for_name("MUS_MUSCULUS")

    second_sample = persist(Sample(),
                            accession_code='GSM1238108',
                            title='GSM1238108',
                            organism=human)
    persist(SampleResultAssociation(),
            sample=second_sample,
            result=second_result)
    persist(ExperimentSampleAssociation(),
            experiment=second_experiment,
            sample=second_sample)

    second_filename = "GSM1238108-tbl-1.txt"
    second_file = persist(
        ComputedFile(),
        filename=second_filename,
        absolute_file_path="/home/user/data_store/PCL/" + second_filename,
        result=second_result,
        size_in_bytes=123,
        is_smashable=True)
    persist(SampleComputedFileAssociation(),
            sample=second_sample,
            computed_file=second_file)

    dataset = persist(
        Dataset(),
        data={'GSE51081': ['GSM1237810'], 'GSE51084': ['GSM1238108']},
        aggregate_by='ALL',
        scale_by='STANDARD',
        email_address="*****@*****.**",
        quantile_normalize=False)

    persist(ProcessorJobDatasetAssociation(),
            processor_job=smasher_job,
            dataset=dataset)

    final_context = smasher.smash(smasher_job.pk, upload=False)

    # Print the failure reason from both the reloaded row and the copy
    # carried in the job context.
    reloaded_dataset = Dataset.objects.get(id=dataset.id)
    print(reloaded_dataset.failure_reason)
    print(final_context['dataset'].failure_reason)

    self.assertEqual(final_context['unsmashable_files'], ['GSM1238108'])
def test_create_compendia_danio(self):
    """Build a DANIO_RERIO compendium from the microarray and RNA-seq fixture directories.

    Expects three computed files in the final context, each present on disk.
    """

    def register_directory(directory, skipped_marker, technology,
                           experiment, result, organism):
        # Create one Sample (plus result link, experiment link, computed
        # file and file link) for every entry of `directory` whose name
        # does not contain `skipped_marker`. Returns the names used.
        registered = []
        for entry in os.listdir(directory):
            if skipped_marker in entry:
                continue

            sample = Sample()
            sample.accession_code = entry
            sample.title = entry
            sample.organism = organism
            sample.technology = technology
            sample.save()

            result_link = SampleResultAssociation()
            result_link.sample = sample
            result_link.result = result
            result_link.save()

            experiment_link = ExperimentSampleAssociation()
            experiment_link.experiment = experiment
            experiment_link.sample = sample
            experiment_link.save()

            computed_file = ComputedFile()
            computed_file.filename = entry
            computed_file.absolute_file_path = directory + entry
            computed_file.result = result
            computed_file.size_in_bytes = 123
            computed_file.is_smashable = True
            computed_file.save()

            file_link = SampleComputedFileAssociation()
            file_link.sample = sample
            file_link.computed_file = computed_file
            file_link.save()

            registered.append(entry)
        return registered

    compendia_job = ProcessorJob()
    compendia_job.pipeline_applied = "COMPENDIA"
    compendia_job.save()

    # MICROARRAY TECH
    microarray_experiment = Experiment()
    microarray_experiment.accession_code = "GSE1234"
    microarray_experiment.save()

    microarray_result = ComputationalResult()
    microarray_result.save()

    danio_rerio = Organism.get_object_for_name("DANIO_RERIO")

    micros = register_directory(
        '/home/user/data_store/raw/TEST/MICROARRAY/', 'microarray.txt',
        "MICROARRAY", microarray_experiment, microarray_result, danio_rerio)

    # RNASEQ TECH
    rnaseq_experiment = Experiment()
    rnaseq_experiment.accession_code = "GSE5678"
    rnaseq_experiment.save()

    rnaseq_result = ComputationalResult()
    rnaseq_result.save()

    rnas = register_directory(
        '/home/user/data_store/raw/TEST/RNASEQ/', 'rnaseq.txt',
        "RNASEQ", rnaseq_experiment, rnaseq_result, danio_rerio)

    # QN target file for the organism, with the annotation marking its
    # result as a QN result for that organism id.
    qn_result = ComputationalResult()
    qn_result.save()

    qn_target = ComputedFile()
    qn_target.filename = "danio_target.tsv"
    qn_target.absolute_file_path = '/home/user/data_store/QN/danio_target.tsv'
    qn_target.is_qn_target = True
    qn_target.size_in_bytes = "12345"
    qn_target.sha1 = "aabbccddeeff"
    qn_target.result = qn_result
    qn_target.save()

    qn_annotation = ComputationalResultAnnotation()
    qn_annotation.data = {'organism_id': danio_rerio.id, 'is_qn': True}
    qn_annotation.result = qn_result
    qn_annotation.save()

    dataset = Dataset()
    dataset.data = {'GSE1234': micros, 'GSE5678': rnas}
    dataset.scale_by = 'NONE'
    dataset.aggregate_by = 'SPECIES'
    dataset.quantile_normalize = False
    dataset.save()

    job_dataset_link = ProcessorJobDatasetAssociation()
    job_dataset_link.processor_job = compendia_job
    job_dataset_link.dataset = dataset
    job_dataset_link.save()

    final_context = create_compendia.create_compendia(compendia_job.id)

    # Verify result
    produced_files = final_context['computed_files']
    self.assertEqual(len(produced_files), 3)
    for produced in produced_files:
        self.assertTrue(os.path.exists(produced.absolute_file_path))
def setUp(self):
    """Populate the database with the fixture records used by the tests.

    Keeps references on the test case to the "GSE123" experiment
    (``self.experiment``), the "789" sample (``self.sample``) and the
    HOMO_SAPIENS / DANIO_RERIO organisms.
    """
    # A superuser/login fixture used to live here as commented-out code,
    # kept around in case protected endpoints are added.

    def persist(record, **fields):
        # Assign the given attributes, save the record and hand it back.
        for attribute, value in fields.items():
            setattr(record, attribute, value)
        record.save()
        return record

    persist(Experiment(),
            accession_code="GSE000",
            alternate_accession_code="E-GEOD-000",
            title="NONONONO",
            description="Boooooourns. Wasabi.",
            technology="RNA-SEQ")

    main_experiment = persist(
        Experiment(),
        accession_code="GSE123",
        title="Hey Ho Let's Go",
        description=(
            "This is a very exciting test experiment. Faygo soda. Blah blah blah."
        ),
        technology="MICROARRAY")
    self.experiment = main_experiment

    persist(ExperimentAnnotation(),
            data={"hello": "world", "123": 456},
            experiment=main_experiment)

    # Create 26 test organisms numbered 0-25 for pagination test, so there
    # should be 29 organisms total (with the 3 others below)
    for index in range(26):
        Organism(name=("TEST_ORGANISM_{}".format(index)),
                 taxonomy_id=(1234 + index)).save()

    panda = Organism(name="AILUROPODA_MELANOLEUCA",
                     taxonomy_id=9646,
                     is_scientific_name=True)
    panda.save()

    self.homo_sapiens = Organism(name="HOMO_SAPIENS",
                                 taxonomy_id=9606,
                                 is_scientific_name=True)
    self.homo_sapiens.save()

    self.danio_rerio = Organism(name="DANIO_RERIO",
                                taxonomy_id=1337,
                                is_scientific_name=True)
    self.danio_rerio.save()

    persist(Sample(),
            title="123",
            accession_code="123",
            is_processed=True,
            organism=panda)

    main_sample = persist(Sample(),
                          title="789",
                          accession_code="789",
                          is_processed=True,
                          organism=panda)
    self.sample = main_sample

    # add qn target for sample organism
    qn_result = ComputationalResult()
    qn_result.commands.append("create_qn_target.py")
    persist(qn_result, is_ccdl=True, is_public=True, processor=None)

    persist(ComputationalResultAnnotation(),
            result=qn_result,
            data={"organism_id": panda.id, "is_qn": True})

    panda.qn_target = qn_result
    panda.save()

    persist(SampleAnnotation(),
            data={"goodbye": "world", "789": 123},
            sample=main_sample)

    # One original file, linked to the sample and to one downloader job
    # and one processor job.
    original_file = persist(OriginalFile())
    persist(OriginalFileSampleAssociation(),
            sample=main_sample,
            original_file=original_file)

    downloader_job = persist(DownloaderJob())
    persist(DownloaderJobOriginalFileAssociation(),
            original_file=original_file,
            downloader_job=downloader_job)

    processor_job = persist(ProcessorJob())
    persist(ProcessorJobOriginalFileAssociation(),
            original_file=original_file,
            processor_job=processor_job)

    persist(ExperimentSampleAssociation(),
            sample=main_sample,
            experiment=main_experiment)
    persist(main_experiment, num_total_samples=1, num_processed_samples=1)

    # Two further computational results, each linked to the sample.
    for _ in range(2):
        extra_result = persist(ComputationalResult())
        persist(SampleResultAssociation(),
                sample=main_sample,
                result=extra_result)

    salmon_processor = persist(Processor(),
                               name="Salmon Quant",
                               version="v9.9.9",
                               docker_image="dr_salmon",
                               environment='{"some": "environment"}')

    short_index_result = ComputationalResult(processor=salmon_processor)
    short_index_result.save()

    persist(OrganismIndex(),
            index_type="TRANSCRIPTOME_SHORT",
            organism=self.danio_rerio,
            result=short_index_result,
            absolute_directory_path=(
                "/home/user/data_store/salmon_tests/TRANSCRIPTOME_INDEX/SHORT"),
            is_public=True,
            s3_url="not_blank")
def create_experiment_and_samples_from_api(
        self, experiment_accession_code) -> (Experiment, List[Sample]):
    """The main surveyor - find the Experiment and Samples from NCBI GEO.

    Uses the GEOParse library, for which docs can be found here:
    https://geoparse.readthedocs.io/en/latest/usage.html#working-with-geo-objects

    Args:
        experiment_accession_code: accession of the GEO series to survey;
            passed to ``GEOparse.get_GEO`` and used as the Experiment's
            ``accession_code``.

    Returns:
        A tuple ``(experiment_object, created_samples)``. The experiment is
        either the existing row for the accession or a newly created one.
        ``created_samples`` holds only the samples created by this call
        whose technology is not 'RNA-SEQ'; samples that already existed are
        associated with the experiment but not returned.
    """
    # File-name fragments that mark a supplementary file as non-normalized
    # data, whatever the metadata says about a data processing step.
    non_normalized_markers = (
        '_non_normalized.txt',
        '_non-normalized.txt',
        '-non-normalized.txt',
        '-non_normalized.txt',
    )
    # Sample-level supplementary files we never want.
    unwanted_markers = (
        "idat.gz", "chp.gz", "ndf.gz", "pos.gz", "pair.gz", "gff.gz",
    )

    # Cleaning up is tracked here: https://github.com/guma44/GEOparse/issues/41
    gse = GEOparse.get_GEO(experiment_accession_code,
                           destdir=self.get_temp_path(),
                           how="brief",
                           silent=True)
    preprocessed_samples = harmony.preprocess_geo(gse.gsms.items())
    harmonized_samples = harmony.harmonize(preprocessed_samples)

    # Create the experiment object
    try:
        experiment_object = Experiment.objects.get(
            accession_code=experiment_accession_code)
        logger.debug(
            "Experiment %s already exists, skipping object creation.",
            experiment_accession_code,
            survey_job=self.survey_job.id)
    except Experiment.DoesNotExist:
        experiment_object = Experiment()
        experiment_object.accession_code = experiment_accession_code
        experiment_object.source_url = (
            "https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=" +
            experiment_accession_code)
        experiment_object.source_database = "GEO"
        experiment_object.title = gse.metadata.get('title', [''])[0]
        experiment_object.description = gse.metadata.get('summary', [''])[0]

        # Source doesn't provide time information, assume midnight.
        submission_date = gse.metadata["submission_date"][0] + " 00:00:00 UTC"
        experiment_object.source_first_published = dateutil.parser.parse(
            submission_date)
        last_updated_date = gse.metadata["last_update_date"][0] + " 00:00:00 UTC"
        # Use the same attribute name the ArrayExpress surveyor writes, so
        # the value lands on the Experiment's last-modified attribute.
        experiment_object.source_last_modified = dateutil.parser.parse(
            last_updated_date)

        # Sorted so the joined string is the same on every run.
        unique_institutions = sorted(set(gse.metadata["contact_institute"]))
        experiment_object.submitter_institution = ", ".join(
            unique_institutions)
        experiment_object.pubmed_id = gse.metadata.get("pubmed_id", [""])[0]

        # Scrape publication title and authorship from Pubmed
        if experiment_object.pubmed_id:
            pubmed_metadata = utils.get_title_and_authors_for_pubmed_id(
                experiment_object.pubmed_id)
            experiment_object.publication_title = pubmed_metadata[0]
            experiment_object.publication_authors = pubmed_metadata[1]

        experiment_object.save()

        experiment_annotation = ExperimentAnnotation()
        experiment_annotation.data = gse.metadata
        experiment_annotation.experiment = experiment_object
        experiment_annotation.is_ccdl = False
        experiment_annotation.save()

    # Okay, here's the situation!
    # Sometimes, samples have a direct single representation for themselves.
    # Othertimes, there is a single file with references to every sample in it.
    created_samples = []
    for sample_accession_code, sample in gse.gsms.items():
        try:
            sample_object = Sample.objects.get(
                accession_code=sample_accession_code)
            logger.debug(
                "Sample %s from experiment %s already exists, skipping object creation.",
                sample_accession_code,
                experiment_object.accession_code,
                survey_job=self.survey_job.id)

            # Associate it with the experiment, but since it
            # already exists it already has original files
            # associated with it and it's already been downloaded,
            # so don't add it to created_samples.
            ExperimentSampleAssociation.objects.get_or_create(
                experiment=experiment_object, sample=sample_object)

            ExperimentOrganismAssociation.objects.get_or_create(
                experiment=experiment_object,
                organism=sample_object.organism)
        except Sample.DoesNotExist:
            organism = Organism.get_object_for_name(
                sample.metadata['organism_ch1'][0].upper())

            sample_object = Sample()
            sample_object.source_database = "GEO"
            sample_object.accession_code = sample_accession_code
            sample_object.organism = organism

            # If data processing step, it isn't raw.
            sample_object.has_raw = not sample.metadata.get(
                'data_processing', None)

            ExperimentOrganismAssociation.objects.get_or_create(
                experiment=experiment_object, organism=organism)
            sample_object.title = sample.metadata['title'][0]

            self.set_platform_properties(sample_object, sample.metadata, gse)

            # Directly assign the harmonized properties
            harmonized_sample = harmonized_samples[sample_object.title]
            for key, value in harmonized_sample.items():
                setattr(sample_object, key, value)

            # Sample-level protocol_info
            sample_object.protocol_info = self.get_sample_protocol_info(
                sample.metadata, sample_accession_code)

            sample_object.save()
            logger.debug("Created Sample: " + str(sample_object))

            sample_annotation = SampleAnnotation()
            sample_annotation.sample = sample_object
            sample_annotation.data = sample.metadata
            sample_annotation.is_ccdl = False
            sample_annotation.save()

            sample_supplements = sample.metadata.get('supplementary_file', [])
            for supplementary_file_url in sample_supplements:
                # Why do they give us this?
                if supplementary_file_url == "NONE":
                    break

                lower_file_url = supplementary_file_url.lower()

                # We never want these!
                if any(marker in lower_file_url for marker in unwanted_markers):
                    continue

                # Sometimes, we are lied to about the data processing step.
                if '.cel' in lower_file_url or any(
                        marker in lower_file_url
                        for marker in non_normalized_markers):
                    sample_object.has_raw = True
                    sample_object.save()

                # filename and source_filename are the same for these
                filename = supplementary_file_url.split('/')[-1]
                original_file = OriginalFile.objects.get_or_create(
                    source_url=supplementary_file_url,
                    filename=filename,
                    source_filename=filename,
                    has_raw=sample_object.has_raw,
                    is_archive=True)[0]
                logger.debug("Created OriginalFile: " + str(original_file))

                OriginalFileSampleAssociation.objects.get_or_create(
                    original_file=original_file, sample=sample_object)

                if original_file.is_affy_data():
                    # Only Affymetrix Microarrays produce .CEL files
                    sample_object.technology = 'MICROARRAY'
                    sample_object.manufacturer = 'AFFYMETRIX'
                    sample_object.save()

            # It's okay to survey RNA-Seq samples from GEO, but we
            # don't actually want to download/process any RNA-Seq
            # data unless it comes from SRA.
            if sample_object.technology != 'RNA-SEQ':
                created_samples.append(sample_object)

            # Now that we've determined the technology at the
            # sample level, we can set it at the experiment level,
            # just gotta make sure to only do it once. There can
            # be more than one technology, this should be changed
            # as part of:
            # https://github.com/AlexsLemonade/refinebio/issues/1099
            if not experiment_object.technology:
                experiment_object.technology = sample_object.technology
                experiment_object.save()

            ExperimentSampleAssociation.objects.get_or_create(
                experiment=experiment_object, sample=sample_object)

    # NOTE(review): from here on `sample_object` is whatever the loops above
    # last bound it to. If the series has no samples the name is never
    # bound and the `has_raw=sample_object.has_raw` lookups below raise
    # NameError - confirm whether an empty series can reach this point.

    # These supplementary files _may-or-may-not_ contain the type of raw data we can process.
    for experiment_supplement_url in gse.metadata.get('supplementary_file', []):
        # filename and source_filename are the same for these
        filename = experiment_supplement_url.split('/')[-1]
        original_file = OriginalFile.objects.get_or_create(
            source_url=experiment_supplement_url,
            filename=filename,
            source_filename=filename,
            has_raw=sample_object.has_raw,
            is_archive=True)[0]
        logger.debug("Created OriginalFile: " + str(original_file))

        lower_supplement_url = experiment_supplement_url.lower()
        if any(marker in lower_supplement_url
               for marker in non_normalized_markers):
            # A non-normalized experiment-level file marks every newly
            # created sample as raw and is linked to each of them.
            for sample_object in created_samples:
                sample_object.has_raw = True
                sample_object.save()

                OriginalFileSampleAssociation.objects.get_or_create(
                    sample=sample_object, original_file=original_file)

        # Delete this Original file if it isn't being used.
        if OriginalFileSampleAssociation.objects.filter(
                original_file=original_file).count() == 0:
            original_file.delete()

    # These are the Miniml/Soft/Matrix URLs that are always(?) provided.
    # GEO describes different types of data formatting as "families"
    family_url = self.get_miniml_url(experiment_accession_code)
    miniml_original_file = OriginalFile.objects.get_or_create(
        source_url=family_url,
        source_filename=family_url.split('/')[-1],
        has_raw=sample_object.has_raw,
        is_archive=True)[0]
    for sample_object in created_samples:
        # We don't need a .txt if we have a .CEL
        if sample_object.has_raw:
            continue
        OriginalFileSampleAssociation.objects.get_or_create(
            sample=sample_object, original_file=miniml_original_file)

    # Delete this Original file if it isn't being used.
    if OriginalFileSampleAssociation.objects.filter(
            original_file=miniml_original_file).count() == 0:
        miniml_original_file.delete()

    # Trash the temp path
    try:
        shutil.rmtree(self.get_temp_path())
    except Exception:
        # There was a problem during surveying so this didn't get created.
        # It's not a big deal.
        pass

    return experiment_object, created_samples
def create_experiment_from_api(
        self, experiment_accession_code: str) -> (Experiment, Dict):
    """Given an experiment accession code, create an Experiment object.

    Also returns a dictionary of additional information about the
    platform discovered for the experiment.

    See an example at: https://www.ebi.ac.uk/arrayexpress/json/v3/experiments/E-MTAB-3050/sample

    Args:
        experiment_accession_code: accession appended to EXPERIMENTS_URL to
            build the request URL.

    Returns:
        A tuple ``(experiment_object, platform_dict)`` where
        ``platform_dict`` has the keys 'platform_accession_code',
        'platform_accession_name' and 'manufacturer'. Each may be UNKNOWN
        when the platform cannot be determined here.

    Raises:
        UnsupportedPlatformException: the remote experiment lists no
            arraydesign at all.
        KeyError, IndexError: the response carries no experiment entry;
            the error is logged and re-raised.
    """
    request_url = EXPERIMENTS_URL + experiment_accession_code
    experiment_request = utils.requests_retry_session().get(request_url,
                                                            timeout=60)

    try:
        parsed_json = experiment_request.json()["experiments"]["experiment"][0]
    except (KeyError, IndexError):
        # An empty "experiment" list is just as unusable as a missing key.
        logger.error("Remote experiment has no Experiment data!",
                     experiment_accession_code=experiment_accession_code,
                     survey_job=self.survey_job.id)
        raise

    experiment = {}
    experiment["name"] = parsed_json["name"]
    experiment["experiment_accession_code"] = experiment_accession_code

    # This experiment has no platform at all, and is therefore useless.
    if 'arraydesign' not in parsed_json or len(parsed_json["arraydesign"]) == 0:
        logger.warning("Remote experiment has no arraydesign listed.",
                       experiment_accession_code=experiment_accession_code,
                       survey_job=self.survey_job.id)
        raise UnsupportedPlatformException
    # If there is more than one arraydesign listed in the experiment
    # then there is no other way to determine which array was used
    # for which sample other than looking at the header of the CEL
    # file. That obviously cannot happen until the CEL file has been
    # downloaded so we can just mark it as UNKNOWN and let the
    # downloader inspect the downloaded file to determine the
    # array then.
    elif (len(parsed_json["arraydesign"]) != 1
          or "accession" not in parsed_json["arraydesign"][0]):
        experiment["platform_accession_code"] = UNKNOWN
        experiment["platform_accession_name"] = UNKNOWN
        experiment["manufacturer"] = UNKNOWN
    else:
        external_accession = parsed_json["arraydesign"][0]["accession"]
        for platform in get_supported_microarray_platforms():
            if platform["external_accession"] == external_accession:
                experiment["platform_accession_code"] = get_normalized_platform(
                    platform["platform_accession"])

                # Illumina appears in the accession codes for
                # platforms manufactured by Illumina
                if "ILLUMINA" in experiment["platform_accession_code"].upper():
                    experiment["manufacturer"] = "ILLUMINA"
                    experiment["platform_accession_name"] = platform[
                        "platform_accession"]
                else:
                    # It's not Illumina, the only other supported Microarray platform is
                    # Affy. As our list of supported platforms grows this logic will
                    # need to get more sophisticated.
                    experiment["manufacturer"] = "AFFYMETRIX"
                    platform_mapping = get_readable_affymetrix_names()
                    experiment["platform_accession_name"] = platform_mapping[
                        platform["platform_accession"]]

        if "platform_accession_code" not in experiment:
            # We don't know what platform this accession corresponds to.
            experiment["platform_accession_code"] = external_accession
            experiment["platform_accession_name"] = UNKNOWN
            experiment["manufacturer"] = UNKNOWN

    experiment["release_date"] = parsed_json["releasedate"]

    # Fall back to the release date when no update date is given.
    if "lastupdatedate" in parsed_json:
        experiment["last_update_date"] = parsed_json["lastupdatedate"]
    else:
        experiment["last_update_date"] = parsed_json["releasedate"]

    # Create the experiment object
    try:
        experiment_object = Experiment.objects.get(
            accession_code=experiment_accession_code)
        logger.debug(
            "Experiment already exists, skipping object creation.",
            experiment_accession_code=experiment_accession_code,
            survey_job=self.survey_job.id)
    except Experiment.DoesNotExist:
        # We aren't sure these fields will be populated, or how many there will be.
        # Try to join them all together, or set a sensible default.
        experiment_description = "".join(
            description_item["text"] + "\n"
            for description_item in parsed_json.get("description", [])
            if "text" in description_item)
        if experiment_description == "":
            experiment_description = "Description not available.\n"

        experiment_object = Experiment()
        experiment_object.accession_code = experiment_accession_code
        experiment_object.source_url = request_url
        experiment_object.source_database = "ARRAY_EXPRESS"
        experiment_object.title = parsed_json["name"]
        # This will need to be updated if we ever use Array
        # Express to get other kinds of data.
        experiment_object.technology = "MICROARRAY"
        experiment_object.description = experiment_description

        experiment_object.source_first_published = parse_datetime(
            experiment["release_date"])
        experiment_object.source_last_modified = parse_datetime(
            experiment["last_update_date"])
        experiment_object.save()

        json_xa = ExperimentAnnotation()
        json_xa.experiment = experiment_object
        json_xa.data = parsed_json
        json_xa.is_ccdl = False
        json_xa.save()

        ## Fetch and parse the IDF/SDRF file for any other fields
        IDF_URL_TEMPLATE = "https://www.ebi.ac.uk/arrayexpress/files/{code}/{code}.idf.txt"
        idf_url = IDF_URL_TEMPLATE.format(code=experiment_accession_code)
        idf_text = utils.requests_retry_session().get(idf_url, timeout=60).text

        # Each IDF line is tab separated: a key followed by one value
        # (stored as a string) or several values (stored as a list).
        # Lines with no value are ignored.
        idf_dict = {}
        for line in idf_text.split('\n'):
            keyval = line.strip().split('\t')
            if len(keyval) == 2:
                idf_dict[keyval[0]] = keyval[1]
            elif len(keyval) > 2:
                idf_dict[keyval[0]] = keyval[1:]

        idf_xa = ExperimentAnnotation()
        idf_xa.data = idf_dict
        idf_xa.experiment = experiment_object
        idf_xa.is_ccdl = False
        idf_xa.save()

        if 'Investigation Title' in idf_dict:
            experiment_object.title = idf_dict['Investigation Title']

        if 'Person Affiliation' in idf_dict:
            affiliation = idf_dict['Person Affiliation']
            # This is very rare, ex: E-MEXP-32
            if isinstance(affiliation, list):
                # Sorted so the joined string is the same on every run.
                affiliation = ", ".join(sorted(set(affiliation)))
            # Apply the same 255 character cap whichever form was given.
            experiment_object.submitter_institution = affiliation[:255]

        # Get protocol_description from "<experiment_url>/protocols"
        # instead of from idf_dict, because the former provides more
        # details.
        protocol_url = request_url + '/protocols'
        protocol_request = utils.requests_retry_session().get(protocol_url,
                                                              timeout=60)
        try:
            experiment_object.protocol_description = protocol_request.json(
            )['protocols']
        except KeyError:
            logger.warning(
                "Remote experiment has no protocol data!",
                experiment_accession_code=experiment_accession_code,
                survey_job=self.survey_job.id)

        if 'Publication Title' in idf_dict:
            # This will happen for some superseries.
            # Ex: E-GEOD-29536
            # When several titles are listed they are all kept, joined
            # with "; ".
            if isinstance(idf_dict['Publication Title'], list):
                experiment_object.publication_title = "; ".join(
                    idf_dict['Publication Title'])
            else:
                experiment_object.publication_title = idf_dict[
                    'Publication Title']
            experiment_object.has_publication = True

        if 'Publication DOI' in idf_dict:
            if isinstance(idf_dict['Publication DOI'], list):
                experiment_object.publication_doi = ", ".join(
                    idf_dict['Publication DOI'])
            else:
                experiment_object.publication_doi = idf_dict[
                    'Publication DOI']
            experiment_object.has_publication = True

        if 'PubMed ID' in idf_dict:
            if isinstance(idf_dict['PubMed ID'], list):
                experiment_object.pubmed_id = ", ".join(
                    idf_dict['PubMed ID'])
            else:
                experiment_object.pubmed_id = idf_dict['PubMed ID']
            experiment_object.has_publication = True

        # Scrape publication title and authorship from Pubmed
        if experiment_object.pubmed_id:
            pubmed_metadata = utils.get_title_and_authors_for_pubmed_id(
                experiment_object.pubmed_id)
            experiment_object.publication_title = pubmed_metadata[0]
            experiment_object.publication_authors = pubmed_metadata[1]

        experiment_object.save()

    platform_dict = {
        key: experiment[key]
        for key in ('platform_accession_code', 'platform_accession_name',
                    'manufacturer')
    }

    return experiment_object, platform_dict
def test_make_experiment_result_associations(self):
    """Tests that the correct associations are made.

    Fixture:
      * One tximport ComputationalResult exists.
      * Experiment "SRP12345" has two samples, both linked to that result.
      * Experiment "SRP6789" has its own sample "SRX6789" and also shares
        sample "SRX12346" with the first experiment.
      * Neither experiment is linked to the result beforehand.

    Expectation after running make_experiment_result_associations():
      * Exactly one ExperimentResultAssociation exists.
      * It points at the first experiment ("SRP12345").
    """
    # Get an organism to set on samples:
    human = Organism.get_object_for_name("HOMO_SAPIENS", taxonomy_id=9606)

    # Create the tximport processor and result:
    tximport_processor = Processor()
    tximport_processor.name = "Tximport"
    tximport_processor.version = "v9.9.9"
    tximport_processor.docker_image = "dr_salmon"
    tximport_processor.environment = '{"some": "environment"}'
    tximport_processor.save()

    tximport_result = ComputationalResult()
    tximport_result.commands.append("tximport invocation")
    tximport_result.is_ccdl = True
    tximport_result.processor = tximport_processor
    tximport_result.save()

    def link_to_experiment(experiment, sample):
        # Record that `sample` belongs to `experiment`.
        membership = ExperimentSampleAssociation()
        membership.experiment = experiment
        membership.sample = sample
        membership.save()

    def add_sample(accession, experiment):
        # Create a sample, link it to the tximport result, then to the
        # experiment - in that order.
        sample = Sample()
        sample.accession_code = accession
        sample.title = accession
        sample.organism = human
        sample.save()

        result_link = SampleResultAssociation()
        result_link.sample = sample
        result_link.result = tximport_result
        result_link.save()

        link_to_experiment(experiment, sample)
        return sample

    # Create the first experiment and its samples:
    processed_experiment = Experiment()
    processed_experiment.accession_code = "SRP12345"
    processed_experiment.save()

    add_sample("SRX12345", processed_experiment)
    shared_sample = add_sample("SRX12346", processed_experiment)

    # Create the second experiment and its additional sample.
    unprocessed_experiment = Experiment()
    unprocessed_experiment.accession_code = "SRP6789"
    unprocessed_experiment.save()

    # NOTE(review): this sample is described as not yet processed by
    # tximport, yet it is linked to the tximport result like the others -
    # confirm that is what the function under test expects.
    add_sample("SRX6789", unprocessed_experiment)
    link_to_experiment(unprocessed_experiment, shared_sample)

    # Run the function we're testing:
    make_experiment_result_associations()

    # Test that only one association was created and that it was
    # to the processed experiment:
    associations = ExperimentResultAssociation.objects.all()

    self.assertEqual(len(associations), 1)
    self.assertEqual(associations.first().experiment, processed_experiment)
def test_create_compendia(self):
    """Run create_compendia on a mixed microarray / RNA-seq dataset.

    One of the microarray samples has a computed file whose path is
    "/home/user/data_store/PCL/doesnt_exists.PCL"; the test expects that
    sample to be reported under final_context["filtered_samples"].
    """
    job = ProcessorJob(pipeline_applied=ProcessorPipeline.CREATE_COMPENDIA.value)
    job.save()

    # MICROARRAY TECH
    microarray_experiment = Experiment(accession_code="GSE1487313")
    microarray_experiment.save()

    microarray_result = ComputationalResult()
    microarray_result.save()

    gallus_gallus = Organism.get_object_for_name("GALLUS_GALLUS", taxonomy_id=1001)

    liver_sample = Sample(
        accession_code="GSM1487313",
        title="GSM1487313",
        organism=gallus_gallus,
        technology="MICROARRAY",
    )
    liver_sample.save()
    SampleResultAssociation(sample=liver_sample, result=microarray_result).save()
    ExperimentSampleAssociation(
        experiment=microarray_experiment, sample=liver_sample).save()

    liver_filename = "GSM1487313_liver.PCL"
    liver_file = ComputedFile(
        filename=liver_filename,
        absolute_file_path="/home/user/data_store/PCL/" + liver_filename,
        result=microarray_result,
        size_in_bytes=123,
        is_smashable=True,
    )
    liver_file.save()
    SampleComputedFileAssociation(
        sample=liver_sample, computed_file=liver_file).save()

    # Sample whose computed file path is "doesnt_exists.PCL"; it is the one
    # the assertions below expect to be filtered out.
    filtered_sample = Sample(
        accession_code="GSM1487222",
        title="this sample will be filtered",
        organism=gallus_gallus,
        technology="MICROARRAY",
    )
    filtered_sample.save()
    SampleResultAssociation(sample=filtered_sample, result=microarray_result).save()
    ExperimentSampleAssociation(
        experiment=microarray_experiment, sample=filtered_sample).save()

    empty_file = ComputedFile(
        filename="GSM1487222_empty.PCL",
        absolute_file_path="/home/user/data_store/PCL/doesnt_exists.PCL",
        result=microarray_result,
        size_in_bytes=123,
        is_smashable=True,
    )
    empty_file.save()
    SampleComputedFileAssociation(
        sample=filtered_sample, computed_file=empty_file).save()

    # RNASEQ TECH
    rnaseq_experiment = Experiment(accession_code="SRS332914")
    rnaseq_experiment.save()

    rnaseq_result = ComputationalResult()
    rnaseq_result.save()

    rnaseq_sample = Sample(
        accession_code="SRS332914",
        title="SRS332914",
        organism=gallus_gallus,
        technology="RNA-SEQ",
    )
    rnaseq_sample.save()
    SampleResultAssociation(sample=rnaseq_sample, result=rnaseq_result).save()
    ExperimentSampleAssociation(
        experiment=rnaseq_experiment, sample=rnaseq_sample).save()

    tpm_filename = "SRP149598_gene_lengthScaledTPM.tsv"
    tpm_file = ComputedFile(
        filename=tpm_filename,
        absolute_file_path="/home/user/data_store/PCL/" + tpm_filename,
        result=rnaseq_result,
        size_in_bytes=234,
        is_smashable=True,
    )
    tpm_file.save()
    SampleComputedFileAssociation(
        sample=rnaseq_sample, computed_file=tpm_file).save()

    # NOTE(review): the second key is "SRX332914" while the RNA-seq
    # experiment above was saved as "SRS332914" - confirm this is intended.
    dset = Dataset(
        data={
            "GSE1487313": ["GSM1487313", "GSM1487222"],
            "SRX332914": ["SRS332914"],
        },
        scale_by="NONE",
        aggregate_by="SPECIES",
        svd_algorithm="ARPACK",
        quantile_normalize=False,
    )
    dset.save()

    ProcessorJobDatasetAssociation(processor_job=job, dataset=dset).save()

    final_context = create_compendia.create_compendia(job.id)

    # NOTE(review): `job` is the in-memory instance created at the top of
    # this test and is not refreshed from the database before this check -
    # confirm the assertion observes what create_compendia recorded.
    self.assertFalse(job.success)

    # The sample pointing at the "doesnt_exists.PCL" path must be reported
    # as filtered, together with the experiment it belongs to.
    filtered = final_context["filtered_samples"]
    self.assertIn("GSM1487222", filtered)
    self.assertEqual(
        filtered["GSM1487222"]["experiment_accession_code"],
        "GSE1487313",
    )
def test_create_compendia(self):
    """Smoke-test create_compendia on one microarray and one RNA-seq sample.

    No assertions are made on the outcome; the test passes as long as
    create_compendia.create_compendia() does not raise.
    """
    job = ProcessorJob(pipeline_applied="COMPENDIA")
    job.save()

    # MICROARRAY TECH
    microarray_experiment = Experiment(accession_code="GSE1487313")
    microarray_experiment.save()

    microarray_result = ComputationalResult()
    microarray_result.save()

    gallus_gallus = Organism.get_object_for_name("GALLUS_GALLUS")

    microarray_sample = Sample(
        accession_code='GSM1487313',
        title='GSM1487313',
        organism=gallus_gallus,
        technology="MICROARRAY",
    )
    microarray_sample.save()
    SampleResultAssociation(
        sample=microarray_sample, result=microarray_result).save()
    ExperimentSampleAssociation(
        experiment=microarray_experiment, sample=microarray_sample).save()

    pcl_filename = "GSM1487313_liver.PCL"
    pcl_file = ComputedFile(
        filename=pcl_filename,
        absolute_file_path="/home/user/data_store/PCL/" + pcl_filename,
        result=microarray_result,
        size_in_bytes=123,
        is_smashable=True,
    )
    pcl_file.save()
    SampleComputedFileAssociation(
        sample=microarray_sample, computed_file=pcl_file).save()

    # RNASEQ TECH
    rnaseq_experiment = Experiment(accession_code="SRS332914")
    rnaseq_experiment.save()

    rnaseq_result = ComputationalResult()
    rnaseq_result.save()

    rnaseq_sample = Sample(
        accession_code='SRS332914',
        title='SRS332914',
        organism=gallus_gallus,
        technology="RNA-SEQ",
    )
    rnaseq_sample.save()
    SampleResultAssociation(sample=rnaseq_sample, result=rnaseq_result).save()
    ExperimentSampleAssociation(
        experiment=rnaseq_experiment, sample=rnaseq_sample).save()

    tpm_filename = "SRP149598_gene_lengthScaledTPM.tsv"
    tpm_file = ComputedFile(
        filename=tpm_filename,
        absolute_file_path="/home/user/data_store/PCL/" + tpm_filename,
        result=rnaseq_result,
        size_in_bytes=234,
        is_smashable=True,
    )
    tpm_file.save()
    SampleComputedFileAssociation(
        sample=rnaseq_sample, computed_file=tpm_file).save()

    # NOTE(review): the second key is 'SRX332914' while the RNA-seq
    # experiment above was saved as "SRS332914" - confirm this is intended.
    dset = Dataset(
        data={'GSE1487313': ['GSM1487313'], 'SRX332914': ['SRS332914']},
        scale_by='NONE',
        aggregate_by='SPECIES',
        quantile_normalize=False,
    )
    dset.save()

    ProcessorJobDatasetAssociation(processor_job=job, dataset=dset).save()

    # The returned context is not inspected.
    create_compendia.create_compendia(job.id)
def test_no_smash_dupe(self):
    """Smash two samples that are both associated with one computed file.

    After smashing, the dataset must be marked successful and no column
    of final_context['original_merged'] may contain '_x' (presumably the
    suffix produced when duplicate columns are merged - verify against
    the smasher).
    """
    job = ProcessorJob(pipeline_applied="SMASHER")
    job.save()

    experiment = Experiment(accession_code="GSE51081")
    experiment.save()

    first_result = ComputationalResult()
    first_result.save()

    homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

    first_sample = Sample(
        accession_code='GSM1237810', title='GSM1237810', organism=homo_sapiens)
    first_sample.save()
    SampleResultAssociation(sample=first_sample, result=first_result).save()
    ExperimentSampleAssociation(experiment=experiment, sample=first_sample).save()

    pcl_filename = "GSM1237810_T09-1084.PCL"
    shared_file = ComputedFile(
        filename=pcl_filename,
        absolute_file_path="/home/user/data_store/PCL/" + pcl_filename,
        result=first_result,
        size_in_bytes=123,
        is_smashable=True,
    )
    shared_file.save()

    # A fresh result row is created here; the second sample is linked to it.
    second_result = ComputationalResult()
    second_result.save()

    SampleComputedFileAssociation(
        sample=first_sample, computed_file=shared_file).save()

    second_sample = Sample(
        accession_code='GSM1237811', title='GSM1237811', organism=homo_sapiens)
    second_sample.save()
    SampleResultAssociation(sample=second_sample, result=second_result).save()
    ExperimentSampleAssociation(experiment=experiment, sample=second_sample).save()

    # Saved but not attached to anything in this test.
    spare_result = ComputationalResult()
    spare_result.save()

    # The second sample reuses the first sample's computed file.
    SampleComputedFileAssociation(
        sample=second_sample, computed_file=shared_file).save()

    ds = Dataset(
        data={'GSE51081': ['GSM1237810', 'GSM1237811']},
        aggregate_by='ALL',
        scale_by='STANDARD',
        email_address="*****@*****.**",
        quantile_normalize=False,
    )
    ds.save()

    ProcessorJobDatasetAssociation(processor_job=job, dataset=ds).save()

    final_context = smasher.smash(job.pk, upload=False)

    # Re-read the dataset so the check sees what the smasher stored.
    reloaded = Dataset.objects.get(id=ds.id)
    self.assertTrue(reloaded.success)

    for column in final_context['original_merged'].columns:
        self.assertNotIn('_x', column)
def test_qn_reference(self):
    """Create a QN reference target, then smash a dataset that uses it.

    Part one builds 200 processed microarray samples for HOMO_SAPIENS,
    runs qn_reference.create_qn_reference() and checks the target file's
    existence, size and sha1. Part two runs a quantile-normalizing
    smasher job and checks two numeric values of its output.
    """
    job = ProcessorJob(pipeline_applied="QN_REFERENCE")
    job.save()

    homo_sapiens = Organism(name="HOMO_SAPIENS", taxonomy_id=9606)
    homo_sapiens.save()

    experiment = Experiment(accession_code="12345")
    experiment.save()

    # Numbering starts at 1: we don't have a 0.tsv
    for number in range(1, 201):
        code = str(number)

        sample = Sample(
            accession_code=code,
            title=code,
            platform_accession_code="A-MEXP-1171",
            manufacturer="SLIPPERY DICK'S DISCOUNT MICROARRAYS",
            organism=homo_sapiens,
            technology="MICROARRAY",
            is_processed=True,
        )
        sample.save()

        cr = ComputationalResult()
        cr.save()

        computed_file = ComputedFile(
            filename=code + ".tsv",
            absolute_file_path="/home/user/data_store/QN/" + code + ".tsv",
            size_in_bytes=number,
            result=cr,
            is_smashable=True,
        )
        computed_file.save()

        SampleComputedFileAssociation(
            sample=sample, computed_file=computed_file).save()
        ExperimentSampleAssociation(experiment=experiment, sample=sample).save()

    # We don't QN because we're creating the target now
    dataset = Dataset(
        data={"12345": ["1", "2", "3", "4", "5", "6"]},
        aggregate_by="ALL",
        scale_by="NONE",
        quantile_normalize=False,
    )
    dataset.save()

    ProcessorJobDatasetAssociation(processor_job=job, dataset=dataset).save()

    final_context = qn_reference.create_qn_reference(job.pk)
    self.assertTrue(final_context["success"])

    target_path = final_context["target_file"]
    self.assertTrue(os.path.exists(target_path))
    self.assertEqual(os.path.getsize(target_path), 562)

    # The organism row is reloaded before its qn_target is read.
    homo_sapiens.refresh_from_db()
    target = homo_sapiens.qn_target.computedfile_set.latest()
    self.assertEqual(target.sha1, "de69d348f8b239479e2330d596c4013a7b0b2b6a")

    # Create and run a smasher job that will use the QN target we just made.
    pj = ProcessorJob(pipeline_applied="SMASHER")
    pj.save()

    ds = Dataset(
        data={"12345": ["1", "2", "3", "4", "5"]},
        aggregate_by="SPECIES",
        scale_by="STANDARD",
        email_address="*****@*****.**",
        quantile_normalize=True,
    )
    ds.save()

    ProcessorJobDatasetAssociation(processor_job=pj, dataset=ds).save()

    final_context = smasher.smash(pj.pk, upload=False)
    self.assertTrue(final_context["success"])

    np.testing.assert_almost_equal(
        final_context["merged_qn"]["1"][0], -0.4379488527774811)
    np.testing.assert_almost_equal(
        final_context["original_merged"]["1"][0], -0.5762109)
def test_no_smash_dupe_two(self):
    """Smash the SRP051449 case, in which both samples share one title.

    Both samples are titled 'Danio rerio'. A "danio_target.tsv" computed
    file is registered with an annotation flagged 'is_qn' for the
    organism, and the dataset is smashed with quantile normalization on.
    """
    job = ProcessorJob(pipeline_applied="SMASHER")
    job.save()

    experiment = Experiment(accession_code="SRP051449")
    experiment.save()

    result = ComputationalResult()
    result.save()

    danio_rerio = Organism.get_object_for_name("DANIO_RERIO")

    for run_accession in ('SRR1731761', 'SRR1731762'):
        sample = Sample(
            accession_code=run_accession,
            title='Danio rerio',
            organism=danio_rerio,
        )
        sample.save()

        SampleResultAssociation(sample=sample, result=result).save()
        ExperimentSampleAssociation(experiment=experiment, sample=sample).save()

        tpm_filename = run_accession + "_output_gene_lengthScaledTPM.tsv"
        computed_file = ComputedFile(
            filename=tpm_filename,
            absolute_file_path="/home/user/data_store/PCL/" + tpm_filename,
            result=result,
            size_in_bytes=123,
            is_smashable=True,
        )
        computed_file.save()

        # A new result row is created after each computed file; the next
        # sample (if any) is linked to it.
        result = ComputationalResult()
        result.save()

        SampleComputedFileAssociation(
            sample=sample, computed_file=computed_file).save()

    ds = Dataset(
        data={'SRP051449': ['SRR1731761', 'SRR1731762']},
        aggregate_by='SPECIES',
        scale_by='NONE',
        email_address="*****@*****.**",
        quantile_normalize=True,
    )
    ds.save()

    ProcessorJobDatasetAssociation(processor_job=job, dataset=ds).save()

    # Register the QN target file for the organism.
    cr = ComputationalResult()
    cr.save()

    target_filename = "danio_target.tsv"
    target_file = ComputedFile(
        filename=target_filename,
        absolute_file_path="/home/user/data_store/PCL/" + target_filename,
        result=cr,
        size_in_bytes=123,
        is_smashable=False,
    )
    target_file.save()

    cra = ComputationalResultAnnotation(
        data={'organism_id': danio_rerio.id, 'is_qn': True},
        result=cr,
    )
    cra.save()

    final_context = smasher.smash(job.pk, upload=False)
    self.assertTrue(final_context['success'])
def test_bad_overlap(self):
    """Smash twice, adding a sample backed by "bad.PCL" the second time.

    The first job smashes two samples (files "big.PCL" and "small.PCL").
    The second job adds a third sample with "bad.PCL" and the test checks
    that final_context['final_frame'] still has length 4.
    """
    pj = ProcessorJob(pipeline_applied="SMASHER")
    pj.save()

    experiment = Experiment(accession_code="GSE51081")
    experiment.save()

    result = ComputationalResult()
    result.save()

    homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

    # First sample, backed by "big.PCL".
    big_sample = Sample(
        accession_code='GSM1237810', title='GSM1237810', organism=homo_sapiens)
    big_sample.save()

    SampleAnnotation(data={'hi': 'friend'}, sample=big_sample).save()
    SampleResultAssociation(sample=big_sample, result=result).save()
    ExperimentSampleAssociation(experiment=experiment, sample=big_sample).save()

    big_name = "big.PCL"
    big_file = ComputedFile(
        filename=big_name,
        absolute_file_path="/home/user/data_store/BADSMASH/" + big_name,
        result=result,
        size_in_bytes=123,
        is_smashable=True,
    )
    big_file.save()
    SampleComputedFileAssociation(sample=big_sample, computed_file=big_file).save()

    # Second sample: associated with "big.PCL" first, then "small.PCL".
    small_sample = Sample(
        accession_code='GSM1237812', title='GSM1237812', organism=homo_sapiens)
    small_sample.save()

    ExperimentSampleAssociation(experiment=experiment, sample=small_sample).save()
    SampleComputedFileAssociation(sample=small_sample, computed_file=big_file).save()
    SampleResultAssociation(sample=small_sample, result=result).save()

    small_name = "small.PCL"
    small_file = ComputedFile(
        filename=small_name,
        absolute_file_path="/home/user/data_store/BADSMASH/" + small_name,
        result=result,
        size_in_bytes=123,
        is_smashable=True,
    )
    small_file.save()
    SampleComputedFileAssociation(
        sample=small_sample, computed_file=small_file).save()

    ds = Dataset(
        data={'GSE51081': ['GSM1237810', 'GSM1237812']},
        aggregate_by='ALL',  # [ALL or SPECIES or EXPERIMENT]
        scale_by='NONE',  # [NONE or MINMAX or STANDARD or ROBUST]
        email_address="*****@*****.**",
        quantile_normalize=False,
    )
    ds.save()

    ProcessorJobDatasetAssociation(processor_job=pj, dataset=ds).save()

    # First smash; neither its context nor the re-fetched dataset is
    # inspected.
    final_context = smasher.smash(pj.pk, upload=False)
    ds = Dataset.objects.get(id=ds.id)

    pj = ProcessorJob(pipeline_applied="SMASHER")
    pj.save()

    # Now, make sure the bad can't zero this out.
    bad_sample = Sample(
        accession_code='GSM999', title='GSM999', organism=homo_sapiens)
    bad_sample.save()

    ExperimentSampleAssociation(experiment=experiment, sample=bad_sample).save()
    # Associated with "small.PCL" first, then with "bad.PCL" below.
    SampleComputedFileAssociation(sample=bad_sample, computed_file=small_file).save()
    SampleResultAssociation(sample=bad_sample, result=result).save()

    bad_name = "bad.PCL"
    bad_file = ComputedFile(
        filename=bad_name,
        absolute_file_path="/home/user/data_store/BADSMASH/" + bad_name,
        result=result,
        size_in_bytes=123,
        is_smashable=True,
    )
    bad_file.save()
    SampleComputedFileAssociation(sample=bad_sample, computed_file=bad_file).save()

    ds = Dataset(
        data={'GSE51081': ['GSM1237810', 'GSM1237812', 'GSM999']},
        aggregate_by='ALL',  # [ALL or SPECIES or EXPERIMENT]
        scale_by='NONE',  # [NONE or MINMAX or STANDARD or ROBUST]
        email_address="*****@*****.**",
        quantile_normalize=False,
    )
    ds.save()

    ProcessorJobDatasetAssociation(processor_job=pj, dataset=ds).save()

    final_context = smasher.smash(pj.pk, upload=False)
    ds = Dataset.objects.get(id=ds.id)

    self.assertEqual(len(final_context['final_frame']), 4)
def _generate_experiment_and_samples(
        self, run_accession: str,
        study_accession: str = None) -> (Experiment, List[Sample]):
    """Generates Experiments and Samples for the provided run_accession.

    Metadata for the run is fetched with SraSurveyor.gather_all_metadata.
    The Experiment (keyed by the metadata's 'study_accession') and the
    Sample (keyed by the metadata's 'run_accession') are looked up first
    and only created when they do not exist yet.

    Args:
        run_accession: Accession of the run to survey.
        study_accession: Optional; only used to enrich the error log
            emitted when no metadata could be found.

    Returns:
        (experiment_object, [sample_object]) on success, or (None, None)
        when no metadata was found or the metadata carries no organism
        name.

    Raises:
        KeyError: if the metadata has no 'run_accession' entry, or (with
            DOWNLOAD_SOURCE == "ENA") no "library_layout" entry.
    """
    metadata = SraSurveyor.gather_all_metadata(run_accession)

    if metadata == {}:
        if study_accession:
            logger.error("Could not discover any metadata for run.",
                         accession=run_accession,
                         study_accession=study_accession)
        else:
            logger.error("Could not discover any metadata for run.",
                         accession=run_accession)
        return (None, None)  # This will cascade properly

    # Work out which source URLs hold the raw data for this run. For ENA,
    # a PAIRED library layout gets two URLs ("_1" and "_2").
    if DOWNLOAD_SOURCE == "ENA":
        if metadata["library_layout"] == "PAIRED":
            files_urls = [
                SraSurveyor._build_ena_file_url(run_accession, "_1"),
                SraSurveyor._build_ena_file_url(run_accession, "_2")
            ]
        else:
            files_urls = [SraSurveyor._build_ena_file_url(run_accession)]
    else:
        files_urls = [SraSurveyor._build_ncbi_file_url(run_accession)]

    # Figure out the Organism for this sample
    # (pop: the key is removed from `metadata` from here on).
    organism_name = metadata.pop("organism_name", None)
    if not organism_name:
        logger.error("Could not discover organism type for run.",
                     accession=run_accession)
        return (None, None)  # This will cascade properly

    organism_name = organism_name.upper()
    organism = Organism.get_object_for_name(organism_name)

    ##
    # Experiment
    ##

    experiment_accession_code = metadata.get('study_accession')
    try:
        experiment_object = Experiment.objects.get(
            accession_code=experiment_accession_code)
        logger.debug(
            "Experiment already exists, skipping object creation.",
            experiment_accession_code=experiment_accession_code,
            survey_job=self.survey_job.id)
    except Experiment.DoesNotExist:
        experiment_object = Experiment()
        experiment_object.accession_code = experiment_accession_code
        experiment_object.source_url = ENA_URL_TEMPLATE.format(
            experiment_accession_code)
        experiment_object.source_database = "SRA"
        experiment_object.technology = "RNA-SEQ"

        # We don't get this value from the API, unfortunately.
        # experiment_object.platform_accession_code = experiment["platform_accession_code"]

        # Placeholder; overwritten below when "study_abstract" is present.
        if not experiment_object.description:
            experiment_object.description = "No description."

        # Copy over whichever optional metadata fields are available.
        if "study_title" in metadata:
            experiment_object.title = metadata["study_title"]
        if "study_abstract" in metadata:
            experiment_object.description = metadata["study_abstract"]
        if "lab_name" in metadata:
            experiment_object.submitter_institution = metadata["lab_name"]
        if "experiment_design_description" in metadata:
            experiment_object.protocol_description = metadata[
                "experiment_design_description"]
        if "pubmed_id" in metadata:
            experiment_object.pubmed_id = metadata["pubmed_id"]
            experiment_object.has_publication = True
        if "study_ena_first_public" in metadata:
            experiment_object.source_first_published = parse_datetime(
                metadata["study_ena_first_public"])
        if "study_ena_last_update" in metadata:
            experiment_object.source_last_modified = parse_datetime(
                metadata["study_ena_last_update"])

        # Rare, but it happens.
        if not experiment_object.protocol_description:
            experiment_object.protocol_description = metadata.get(
                "library_construction_protocol",
                "Protocol was never provided.")

        # Scrape publication title and authorship from Pubmed
        if experiment_object.pubmed_id:
            pubmed_metadata = utils.get_title_and_authors_for_pubmed_id(
                experiment_object.pubmed_id)
            experiment_object.publication_title = pubmed_metadata[0]
            experiment_object.publication_authors = pubmed_metadata[1]

        experiment_object.save()

        ##
        # Experiment Metadata
        ##
        # Saved before 'run_accession' is popped below and after
        # "organism_name" was popped above.
        json_xa = ExperimentAnnotation()
        json_xa.experiment = experiment_object
        json_xa.data = metadata
        json_xa.is_ccdl = False
        json_xa.save()

    ##
    # Samples
    ##

    sample_accession_code = metadata.pop('run_accession')

    # Create the sample object
    try:
        sample_object = Sample.objects.get(
            accession_code=sample_accession_code)

        # If current experiment includes new protocol information,
        # merge it into the sample's existing protocol_info.
        protocol_info, is_updated = self.update_sample_protocol_info(
            sample_object.protocol_info,
            experiment_object.protocol_description,
            experiment_object.source_url)
        if is_updated:
            sample_object.protocol_info = protocol_info
            sample_object.save()

        logger.debug(
            "Sample %s already exists, skipping object creation.",
            sample_accession_code,
            experiment_accession_code=experiment_object.accession_code,
            survey_job=self.survey_job.id)
    except Sample.DoesNotExist:
        sample_object = Sample()
        sample_object.source_database = "SRA"
        sample_object.accession_code = sample_accession_code
        sample_object.organism = organism

        sample_object.platform_name = metadata.get(
            "platform_instrument_model", "UNKNOWN")
        # The platform_name is human readable and contains spaces,
        # accession codes shouldn't have spaces though:
        sample_object.platform_accession_code = sample_object.platform_name.replace(
            " ", "")
        sample_object.technology = "RNA-SEQ"

        # Derive the manufacturer from the (upper-cased) platform name.
        if "ILLUMINA" in sample_object.platform_name.upper() \
           or "NEXTSEQ" in sample_object.platform_name.upper():
            sample_object.manufacturer = "ILLUMINA"
        elif "ION TORRENT" in sample_object.platform_name.upper():
            sample_object.manufacturer = "ION_TORRENT"
        else:
            sample_object.manufacturer = "UNKNOWN"

        # Directly apply the harmonized values
        sample_object.title = harmony.extract_title(metadata)
        harmonized_sample = harmony.harmonize([metadata])
        for key, value in harmonized_sample.items():
            setattr(sample_object, key, value)

        protocol_info, is_updated = self.update_sample_protocol_info(
            existing_protocols=[],
            experiment_protocol=experiment_object.protocol_description,
            experiment_url=experiment_object.source_url)
        # Do not check is_updated the first time because we must
        # save a list so we can append to it later.
        sample_object.protocol_info = protocol_info

        sample_object.save()

        # Register each source file and tie it to the new sample.
        for file_url in files_urls:
            original_file = OriginalFile.objects.get_or_create(
                source_url=file_url,
                source_filename=file_url.split('/')[-1],
                has_raw=True)[0]
            # NOTE(review): the returned (object, created) tuple is bound
            # but never read.
            original_file_sample_association = OriginalFileSampleAssociation.objects.get_or_create(
                original_file=original_file, sample=sample_object)

    # Create associations if they don't already exist
    ExperimentSampleAssociation.objects.get_or_create(
        experiment=experiment_object, sample=sample_object)

    ExperimentOrganismAssociation.objects.get_or_create(
        experiment=experiment_object, organism=organism)

    return experiment_object, [sample_object]
def _generate_experiment_and_samples(
        self, run_accession: str,
        study_accession: str = None) -> (Experiment, List[Sample]):
    """Generates Experiments and Samples for the provided run_accession.

    Metadata for the run is fetched with SraSurveyor.gather_all_metadata.
    The Experiment (keyed by the metadata's "study_accession") and the
    Sample (keyed by the metadata's "run_accession") are looked up first
    and only created when they do not exist yet.

    Args:
        run_accession: Accession of the run to survey.
        study_accession: Optional; only used to enrich the error log
            emitted when no metadata could be found.

    Returns:
        (experiment_object, [sample_object]) on success, or (None, None)
        when no metadata was found or the metadata carries no organism
        name.

    Raises:
        KeyError: if the metadata has no "run_accession" entry, or (with
            DOWNLOAD_SOURCE == "ENA") no "library_layout" entry.
    """
    metadata = SraSurveyor.gather_all_metadata(run_accession)

    if metadata == {}:
        if study_accession:
            logger.error(
                "Could not discover any metadata for run.",
                accession=run_accession,
                study_accession=study_accession,
            )
        else:
            logger.error("Could not discover any metadata for run.",
                         accession=run_accession)
        return (None, None)  # This will cascade properly

    # Work out which source URLs hold the raw data for this run. For ENA,
    # a PAIRED library layout gets two URLs ("_1" and "_2").
    # NOTE(review): `_build_ena_file_url` is referenced as a bare name
    # here, while `_build_ncbi_file_url` is reached through SraSurveyor -
    # confirm a module-level `_build_ena_file_url` is in scope.
    if DOWNLOAD_SOURCE == "ENA":
        if metadata["library_layout"] == "PAIRED":
            files_urls = [
                _build_ena_file_url(run_accession, "_1"),
                _build_ena_file_url(run_accession, "_2"),
            ]
        else:
            files_urls = [_build_ena_file_url(run_accession)]
    else:
        files_urls = [SraSurveyor._build_ncbi_file_url(run_accession)]

    # Figure out the Organism for this sample
    # (pop: the key is removed from `metadata` from here on).
    organism_name = metadata.pop("organism_name", None)
    if not organism_name:
        logger.error("Could not discover organism type for run.",
                     accession=run_accession)
        return (None, None)  # This will cascade properly

    organism_name = organism_name.upper()
    organism = Organism.get_object_for_name(organism_name)

    ##
    # Experiment
    ##

    experiment_accession_code = metadata.get("study_accession")
    try:
        experiment_object = Experiment.objects.get(
            accession_code=experiment_accession_code)
        logger.debug(
            "Experiment already exists, skipping object creation.",
            experiment_accession_code=experiment_accession_code,
            survey_job=self.survey_job.id,
        )
    except Experiment.DoesNotExist:
        experiment_object = Experiment()
        experiment_object.accession_code = experiment_accession_code
        # Presumably fills the remaining experiment fields from
        # `metadata`; the helper is not visible here - verify.
        SraSurveyor._apply_metadata_to_experiment(experiment_object, metadata)
        experiment_object.save()

        ##
        # Experiment Metadata
        ##
        # Saved before "run_accession" is popped below and after
        # "organism_name" was popped above.
        json_xa = ExperimentAnnotation()
        json_xa.experiment = experiment_object
        json_xa.data = metadata
        json_xa.is_ccdl = False
        json_xa.save()

    ##
    # Samples
    ##

    sample_accession_code = metadata.pop("run_accession")

    # Create the sample object
    try:
        sample_object = Sample.objects.get(
            accession_code=sample_accession_code)

        # If current experiment includes new protocol information,
        # merge it into the sample's existing protocol_info.
        protocol_info, is_updated = self.update_sample_protocol_info(
            sample_object.protocol_info,
            experiment_object.protocol_description,
            experiment_object.source_url,
        )
        if is_updated:
            sample_object.protocol_info = protocol_info
            sample_object.save()

        logger.debug(
            "Sample %s already exists, skipping object creation.",
            sample_accession_code,
            experiment_accession_code=experiment_object.accession_code,
            survey_job=self.survey_job.id,
        )
    except Sample.DoesNotExist:
        sample_object = Sample()
        sample_object.source_database = "SRA"
        sample_object.accession_code = sample_accession_code
        sample_object.organism = organism

        sample_object.platform_name = metadata.get(
            "platform_instrument_model", "UNKNOWN")
        # The platform_name is human readable and contains spaces,
        # accession codes shouldn't have spaces though:
        sample_object.platform_accession_code = sample_object.platform_name.replace(
            " ", "")
        sample_object.technology = "RNA-SEQ"

        # Derive the manufacturer from the (upper-cased) platform name.
        if ("ILLUMINA" in sample_object.platform_name.upper()
                or "NEXTSEQ" in sample_object.platform_name.upper()):
            sample_object.manufacturer = "ILLUMINA"
        elif "ION TORRENT" in sample_object.platform_name.upper():
            sample_object.manufacturer = "ION_TORRENT"
        else:
            sample_object.manufacturer = "UNKNOWN"

        # Presumably sets the title and harmonized fields on the sample;
        # the helper is not visible here - verify.
        SraSurveyor._apply_harmonized_metadata_to_sample(
            sample_object, metadata)

        protocol_info, is_updated = self.update_sample_protocol_info(
            existing_protocols=[],
            experiment_protocol=experiment_object.protocol_description,
            experiment_url=experiment_object.source_url,
        )
        # Do not check is_updated the first time because we must
        # save a list so we can append to it later.
        sample_object.protocol_info = protocol_info

        sample_object.save()

        # Register each source file and tie it to the new sample.
        for file_url in files_urls:
            original_file = OriginalFile.objects.get_or_create(
                source_url=file_url,
                source_filename=file_url.split("/")[-1],
                has_raw=True)[0]
            OriginalFileSampleAssociation.objects.get_or_create(
                original_file=original_file, sample=sample_object)

    # Create associations if they don't already exist
    ExperimentSampleAssociation.objects.get_or_create(
        experiment=experiment_object, sample=sample_object)

    ExperimentOrganismAssociation.objects.get_or_create(
        experiment=experiment_object, organism=organism)

    return experiment_object, [sample_object]
def prepare_job():
    """Build and return a SMASHER ProcessorJob with a two-sample dataset.

    Creates experiment "GSE51081" with samples 'GSM1237810' and
    'GSM1237812', their computed files and associations, a Dataset that
    aggregates by EXPERIMENT and scales by STANDARD, and links the
    dataset to the job.

    Returns:
        The saved ProcessorJob.
    """
    pj = ProcessorJob(pipeline_applied="SMASHER")
    pj.save()

    experiment = Experiment(accession_code="GSE51081")
    experiment.save()

    result = ComputationalResult()
    result.save()

    homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

    # First sample, with an annotation and its own PCL file.
    first_sample = Sample(
        accession_code='GSM1237810', title='GSM1237810', organism=homo_sapiens)
    first_sample.save()

    SampleAnnotation(data={'hi': 'friend'}, sample=first_sample).save()
    SampleResultAssociation(sample=first_sample, result=result).save()
    ExperimentSampleAssociation(experiment=experiment, sample=first_sample).save()

    first_name = "GSM1237810_T09-1084.PCL"
    first_file = ComputedFile(
        filename=first_name,
        absolute_file_path="/home/user/data_store/PCL/" + first_name,
        result=result,
        size_in_bytes=123,
        is_smashable=True,
    )
    first_file.save()
    SampleComputedFileAssociation(
        sample=first_sample, computed_file=first_file).save()

    # Second sample; it is first associated with the first sample's file.
    second_sample = Sample(
        accession_code='GSM1237812', title='GSM1237812', organism=homo_sapiens)
    second_sample.save()

    ExperimentSampleAssociation(experiment=experiment, sample=second_sample).save()
    SampleComputedFileAssociation(
        sample=second_sample, computed_file=first_file).save()
    SampleResultAssociation(sample=second_sample, result=result).save()

    # NOTE(review): this PCL file is saved but never associated with a
    # sample; only the .DAT file created next is - confirm intended.
    pcl_name = "GSM1237812_S97-PURE.PCL"
    unlinked_pcl = ComputedFile(
        filename=pcl_name,
        absolute_file_path="/home/user/data_store/PCL/" + pcl_name,
        result=result,
        size_in_bytes=123,
        is_smashable=True,
    )
    unlinked_pcl.save()

    dat_name = "GSM1237812_S97-PURE.DAT"
    dat_file = ComputedFile(
        filename=dat_name,
        absolute_file_path="/home/user/data_store/PCL/" + dat_name,
        result=result,
        size_in_bytes=123,
        is_smashable=False,
    )
    dat_file.save()
    SampleComputedFileAssociation(
        sample=second_sample, computed_file=dat_file).save()

    ds = Dataset(
        data={'GSE51081': ['GSM1237810', 'GSM1237812']},
        aggregate_by='EXPERIMENT',  # [ALL or SPECIES or EXPERIMENT]
        scale_by='STANDARD',  # [NONE or MINMAX or STANDARD or ROBUST]
        email_address="*****@*****.**",
        quantile_normalize=False,
    )
    ds.save()

    ProcessorJobDatasetAssociation(processor_job=pj, dataset=ds).save()

    return pj