def make_test_data(organism):
    experiment = Experiment()
    experiment.accession_code = "GSE51088"
    experiment.technology = "RNA-SEQ"
    experiment.save()

    xoa = ExperimentOrganismAssociation()
    xoa.experiment = experiment
    xoa.organism = organism
    xoa.save()

    result = ComputationalResult()
    result.save()

    sample = Sample()
    sample.accession_code = "GSM1237818"
    sample.title = "GSM1237818"
    sample.organism = organism
    sample.technology = "RNA-SEQ"
    sample.is_processed = True
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.s3_key = "smasher-test-quant.sf"
    computed_file.s3_bucket = "data-refinery-test-assets"
    computed_file.filename = "quant.sf"
    computed_file.absolute_file_path = "/home/user/data_store/QUANT/smasher-test-quant.sf"
    computed_file.result = result
    computed_file.is_smashable = True
    computed_file.size_in_bytes = 123123
    computed_file.sha1 = (
        "08c7ea90b66b52f7cd9d9a569717a1f5f3874967"  # this matches with the downloaded file
    )
    computed_file.save()

    computed_file = ComputedFile()
    computed_file.filename = "logquant.tsv"
    computed_file.is_smashable = True
    computed_file.size_in_bytes = 123123
    computed_file.result = result
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()
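# Usage sketch (illustrative, not part of the original fixtures): make_test_data
# assumes its caller already has a saved Organism, e.g. from a test's setUp.
# The lookup below mirrors calls elsewhere in these tests; the call itself is
# hypothetical.
homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS", taxonomy_id=9606)
make_test_data(homo_sapiens)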
def _create_result_objects(job_context: Dict) -> Dict:
    """Create the ComputationalResult objects after a SCAN run is complete."""
    result = ComputationalResult()
    result.commands.append("SCAN.UPC::SCAN_TwoColor")
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context["time_start"]
    result.time_end = job_context["time_end"]
    try:
        processor_key = "AGILENT_TWOCOLOR"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()
    job_context["pipeline"].steps.append(result.id)

    # Create a ComputedFile for the sample,
    # sync it to S3 and save it.
    try:
        computed_file = ComputedFile()
        computed_file.absolute_file_path = job_context["output_file_path"]
        computed_file.filename = os.path.split(job_context["output_file_path"])[-1]
        computed_file.calculate_sha1()
        computed_file.calculate_size()
        computed_file.result = result
        computed_file.is_smashable = True
        computed_file.is_qc = False
        computed_file.save()
        job_context["computed_files"].append(computed_file)
    except Exception:
        logger.exception(
            "Exception caught while moving file %s to S3",
            computed_file.filename,
            processor_job=job_context["job_id"],
        )
        failure_reason = "Exception caught while moving file to S3"
        job_context["job"].failure_reason = failure_reason
        job_context["success"] = False
        return job_context

    for sample in job_context["samples"]:
        assoc = SampleResultAssociation()
        assoc.sample = sample
        assoc.result = result
        assoc.save()

        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=computed_file
        )

    logger.info("Created %s", result)

    job_context["success"] = True
    return job_context
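# A minimal sketch (an assumption inferred from how the helper is used above,
# not the project's real implementation) of the contract that
# utils.handle_processor_exception appears to honor: fail the job with a
# reason and hand the context back so the pipeline can end early.
def handle_processor_exception_sketch(job_context, processor_key, exception):
    # Record why the processor lookup failed and mark the job unsuccessful.
    job_context["job"].failure_reason = "Failed to find processor {}: {}".format(
        processor_key, exception
    )
    job_context["success"] = False
    return job_context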
def test_fail(self):
    """Test our ability to fail."""
    result = ComputationalResult()
    result.save()

    sample = Sample()
    sample.accession_code = 'XXX'
    sample.title = 'XXX'
    sample.organism = Organism.get_object_for_name("HOMO_SAPIENS")
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    computed_file = ComputedFile()
    computed_file.filename = "NOT_REAL.PCL"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    ds = Dataset()
    ds.data = {'GSE51081': ['XXX']}
    ds.aggregate_by = 'EXPERIMENT'
    ds.scale_by = 'MINMAX'
    ds.email_address = "*****@*****.**"
    ds.quantile_normalize = False
    ds.save()
    dsid = ds.id

    job = ProcessorJob()
    job.pipeline_applied = "SMASHER"
    job.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = ds
    pjda.save()

    final_context = smasher.smash(job.pk, upload=False)

    ds = Dataset.objects.get(id=dsid)
    print(ds.failure_reason)
    print(final_context['dataset'].failure_reason)

    self.assertNotEqual(final_context['unsmashable_files'], [])
def _create_result_objects(job_context: Dict) -> Dict:
    """Create the ComputationalResult objects after a SCAN run is complete."""
    result = ComputationalResult()
    result.commands.append('SCAN.UPC::SCANfast')
    result.is_ccdl = True
    result.is_public = True
    result.time_start = job_context['time_start']
    result.time_end = job_context['time_end']
    try:
        processor_key = "AFFYMETRIX_SCAN"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()
    job_context['pipeline'].steps.append(result.id)

    # Create a ComputedFile for the sample
    computed_file = ComputedFile()
    computed_file.absolute_file_path = job_context["output_file_path"]
    computed_file.filename = os.path.split(job_context["output_file_path"])[-1]
    computed_file.calculate_sha1()
    computed_file.calculate_size()
    computed_file.result = result
    computed_file.is_smashable = True
    computed_file.is_qc = False
    computed_file.save()
    job_context['computed_files'].append(computed_file)

    for sample in job_context['samples']:
        assoc = SampleResultAssociation()
        assoc.sample = sample
        assoc.result = result
        assoc.save()

        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=computed_file
        )

    logger.debug("Created %s", result, processor_job=job_context["job_id"])

    job_context["success"] = True
    return job_context
def _create_result(job_context: Dict) -> Dict:
    """Create the actual Result object."""
    # This is a NO-OP, but we make a ComputationalResult regardless.
    result = ComputationalResult()
    result.commands.append(job_context["script_name"])
    result.is_ccdl = True
    try:
        processor_key = "SUBMITTER_PROCESSED"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()
    job_context["pipeline"].steps.append(result.id)

    # Create a ComputedFile for the output file,
    # sync it to S3 and save it.
    computed_file = ComputedFile()
    computed_file.absolute_file_path = job_context["output_file_path"]
    computed_file.filename = job_context["output_file_path"].split("/")[-1]
    computed_file.calculate_sha1()
    computed_file.calculate_size()
    computed_file.result = result
    computed_file.is_smashable = True
    computed_file.is_qc = False
    computed_file.save()

    # utils.end_job will sync this to S3 for us.
    job_context["computed_files"] = [computed_file]

    for sample in job_context["samples"]:
        assoc = SampleResultAssociation()
        assoc.sample = sample
        assoc.result = result
        assoc.save()

        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=computed_file
        )

    logger.debug("Created %s", result)

    job_context["success"] = True
    return job_context
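# Minimal job_context this no-op processor step reads (keys taken from the
# function above; the values shown are hypothetical):
job_context_sketch = {
    "script_name": "GSE12345.R",
    "output_file_path": "/home/user/data_store/SUBMITTER/GSE12345.PCL",
    "samples": [],          # Samples to associate with the result
    "pipeline": pipeline,   # a Pipeline with a .steps list, assumed to exist
    "job": processor_job,   # read by the exception handler, assumed to exist
}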
def create_sample_for_experiment(sample_info: Dict, experiment: Experiment) -> Sample:
    result = ComputationalResult()
    result.save()

    sample = Sample()
    sample.accession_code = sample_info["accession_code"]
    sample.title = sample_info.get("title", None) or sample_info["accession_code"]
    sample.organism = sample_info["organism"]
    sample.technology = sample_info["technology"]
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    if sample_info.get("filename") is not None:
        computed_file = ComputedFile()
        computed_file.filename = sample_info["filename"]
        computed_file.absolute_file_path = sample_info["data_dir"] + sample_info["filename"]
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

    return sample
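# Illustrative call with hypothetical values: per the function above,
# "accession_code", "organism", and "technology" are required; "title" and
# "filename" are optional, and "data_dir" is only read when "filename" is
# present. The homo_sapiens Organism and the Experiment are assumed to exist.
sample = create_sample_for_experiment(
    {
        "accession_code": "GSM1237810",
        "organism": homo_sapiens,
        "technology": "MICROARRAY",
        "filename": "GSM1237810_T09-1084.PCL",
        "data_dir": "/home/user/data_store/PCL/",
    },
    experiment,
)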
def test_no_smash_dupe(self):
    """Ensure smashing the same file for two samples doesn't produce duplicate ('_x'-suffixed) columns."""
    job = ProcessorJob()
    job.pipeline_applied = "SMASHER"
    job.save()

    experiment = Experiment()
    experiment.accession_code = "GSE51081"
    experiment.save()

    result = ComputationalResult()
    result.save()

    homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

    sample = Sample()
    sample.accession_code = 'GSM1237810'
    sample.title = 'GSM1237810'
    sample.organism = homo_sapiens
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1237810_T09-1084.PCL"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    result = ComputationalResult()
    result.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    sample = Sample()
    sample.accession_code = 'GSM1237811'
    sample.title = 'GSM1237811'
    sample.organism = homo_sapiens
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    result = ComputationalResult()
    result.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    ds = Dataset()
    ds.data = {'GSE51081': ['GSM1237810', 'GSM1237811']}
    ds.aggregate_by = 'ALL'
    ds.scale_by = 'STANDARD'
    ds.email_address = "*****@*****.**"
    ds.quantile_normalize = False
    ds.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = ds
    pjda.save()

    final_context = smasher.smash(job.pk, upload=False)

    dsid = ds.id
    ds = Dataset.objects.get(id=dsid)
    self.assertTrue(ds.success)
    for column in final_context['original_merged'].columns:
        self.assertTrue('_x' not in column)
def test_create_quantpendia(self): job = ProcessorJob() job.pipeline_applied = ProcessorPipeline.CREATE_QUANTPENDIA.value job.save() experiment = Experiment() experiment.accession_code = "GSE51088" experiment.save() result = ComputationalResult() result.save() homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS", taxonomy_id=9606) sample = Sample() sample.accession_code = "GSM1237818" sample.title = "GSM1237818" sample.organism = homo_sapiens sample.technology = "RNA-SEQ" sample.save() sra = SampleResultAssociation() sra.sample = sample sra.result = result sra.save() esa = ExperimentSampleAssociation() esa.experiment = experiment esa.sample = sample esa.save() computed_file = ComputedFile() computed_file.s3_key = "smasher-test-quant.sf" computed_file.s3_bucket = "data-refinery-test-assets" computed_file.filename = "quant.sf" computed_file.absolute_file_path = "/home/user/data_store/QUANT/smasher-test-quant.sf" computed_file.result = result computed_file.is_smashable = True computed_file.size_in_bytes = 123123 computed_file.sha1 = ( "08c7ea90b66b52f7cd9d9a569717a1f5f3874967" # this matches with the downloaded file ) computed_file.save() computed_file = ComputedFile() computed_file.filename = "logquant.tsv" computed_file.is_smashable = True computed_file.size_in_bytes = 123123 computed_file.result = result computed_file.save() assoc = SampleComputedFileAssociation() assoc.sample = sample assoc.computed_file = computed_file assoc.save() ds = Dataset() ds.data = {"GSE51088": ["GSM1237818"]} ds.aggregate_by = "EXPERIMENT" ds.scale_by = "STANDARD" ds.email_address = "*****@*****.**" ds.quant_sf_only = True # Make the dataset include quant.sf files only ds.save() pjda = ProcessorJobDatasetAssociation() pjda.processor_job = job pjda.dataset = ds pjda.save() final_context = create_quantpendia(job.id) self.assertTrue( os.path.exists(final_context["output_dir"] + "/GSE51088/GSM1237818_quant.sf")) self.assertTrue( os.path.exists(final_context["output_dir"] + "/README.md")) self.assertTrue( os.path.exists(final_context["output_dir"] + "/LICENSE.TXT")) self.assertTrue( os.path.exists(final_context["output_dir"] + "/aggregated_metadata.json")) self.assertTrue(final_context["metadata"]["quant_sf_only"]) self.assertEqual(final_context["metadata"]["num_samples"], 1) self.assertEqual(final_context["metadata"]["num_experiments"], 1) # test that archive exists quantpendia_file = ComputedFile.objects.filter( is_compendia=True, quant_sf_only=True).latest() self.assertTrue(os.path.exists(quantpendia_file.absolute_file_path))
def test_no_smash_all_diff_species(self):
    """Smashing with 'ALL' across different species is really weird behavior.

    This test isn't exercising a normal case; it just makes sure that
    unsmashable files get marked as such.
    """
    job = ProcessorJob()
    job.pipeline_applied = "SMASHER"
    job.save()

    experiment = Experiment()
    experiment.accession_code = "GSE51081"
    experiment.save()

    result = ComputationalResult()
    result.save()

    homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

    sample = Sample()
    sample.accession_code = 'GSM1237810'
    sample.title = 'GSM1237810'
    sample.organism = homo_sapiens
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1237810_T09-1084.PCL"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    result = ComputationalResult()
    result.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    experiment = Experiment()
    experiment.accession_code = "GSE51084"
    experiment.save()

    mus_mus = Organism.get_object_for_name("MUS_MUSCULUS")

    sample = Sample()
    sample.accession_code = 'GSM1238108'
    sample.title = 'GSM1238108'
    sample.organism = mus_mus  # the second sample belongs to the second species
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1238108-tbl-1.txt"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    ds = Dataset()
    ds.data = {'GSE51081': ['GSM1237810'], 'GSE51084': ['GSM1238108']}
    ds.aggregate_by = 'ALL'
    ds.scale_by = 'STANDARD'
    ds.email_address = "*****@*****.**"
    ds.quantile_normalize = False
    ds.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = ds
    pjda.save()

    final_context = smasher.smash(job.pk, upload=False)

    dsid = ds.id
    ds = Dataset.objects.get(id=dsid)
    print(ds.failure_reason)
    print(final_context['dataset'].failure_reason)

    self.assertEqual(final_context['unsmashable_files'], ['GSM1238108'])
def setUpClass(cls):
    """Set up class."""
    # ref https://stackoverflow.com/a/29655301/763705
    super(ESTestCases, cls).setUpClass()

    experiment = Experiment()
    experiment.accession_code = "GSE000-X"
    experiment.title = "NONONONO"
    experiment.description = "Boooooourns. Wasabi."
    experiment.technology = "RNA-SEQ"
    experiment.save()

    experiment = Experiment()
    experiment.accession_code = "GSE123-X"
    experiment.title = "Hey Ho Let's Go"
    experiment.description = (
        "This is a very exciting test experiment. Faygo soda. Blah blah blah."
    )
    experiment.technology = "MICROARRAY"
    experiment.num_processed_samples = 1  # added below
    experiment.num_total_samples = 1
    experiment.num_downloadable_samples = 1
    experiment.save()

    experiment_annotation = ExperimentAnnotation()
    experiment_annotation.data = {"hello": "world", "123": 456}
    experiment_annotation.experiment = experiment
    experiment_annotation.save()

    sample = Sample()
    sample.title = "123"
    sample.accession_code = "123"
    sample.save()

    organism = Organism(
        name="AILUROPODA_MELANOLEUCA", taxonomy_id=9646, is_scientific_name=True
    )
    organism.save()

    sample = Sample()
    sample.title = "789"
    sample.accession_code = "789"
    sample.is_processed = True
    sample.organism = organism
    sample.save()

    sample_annotation = SampleAnnotation()
    sample_annotation.data = {"goodbye": "world", "789": 123}
    sample_annotation.sample = sample
    sample_annotation.save()

    original_file = OriginalFile()
    original_file.save()

    original_file_sample_association = OriginalFileSampleAssociation()
    original_file_sample_association.sample = sample
    original_file_sample_association.original_file = original_file
    original_file_sample_association.save()

    downloader_job = DownloaderJob()
    downloader_job.save()

    download_assoc = DownloaderJobOriginalFileAssociation()
    download_assoc.original_file = original_file
    download_assoc.downloader_job = downloader_job
    download_assoc.save()

    processor_job = ProcessorJob()
    processor_job.save()

    processor_assoc = ProcessorJobOriginalFileAssociation()
    processor_assoc.original_file = original_file
    processor_assoc.processor_job = processor_job
    processor_assoc.save()

    # associate the experiment with the sample
    experiment_sample_association = ExperimentSampleAssociation()
    experiment_sample_association.sample = sample
    experiment_sample_association.experiment = experiment
    experiment_sample_association.save()

    result = ComputationalResult()
    result.save()

    # and create a qn target for the sample
    computational_result = ComputationalResultAnnotation()
    computational_result.result = result
    computational_result.data = {"is_qn": True, "organism_id": sample.organism.id}
    computational_result.save()

    # and associate it with the sample organism
    sample.organism.qn_target = result
    sample.organism.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    result = ComputationalResult()
    result.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    # Clear the default cache and reindex;
    # otherwise the organisms with qn_targets will be cached.
    cache.clear()
    call_command("search_index", "--rebuild", "-f")
def test_get_results(self):
    """Test our ability to collect the appropriate samples."""
    sample = Sample()
    sample.accession_code = 'GSM45588'
    sample.save()

    result = ComputationalResult()
    result.save()

    computed_file1 = ComputedFile()
    computed_file1.filename = "oh_boy.txt"
    computed_file1.result = result
    computed_file1.size_in_bytes = 123
    computed_file1.is_smashable = True
    computed_file1.save()

    computed_file2 = ComputedFile()
    computed_file2.filename = "gee_whiz.bmp"
    computed_file2.result = result
    computed_file2.size_in_bytes = 123
    computed_file2.is_smashable = False
    computed_file2.save()

    assoc = SampleResultAssociation()
    assoc.sample = sample
    assoc.result = result
    assoc.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file1
    assoc.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file2
    assoc.save()

    computed_files = sample.get_result_files()
    self.assertEqual(computed_files.count(), 2)
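# Companion sketch (an assumption inferred from the test above, which calls
# .count() on the return value): get_result_files() appears to return a
# QuerySet of every ComputedFile tied to the sample, smashable or not, so any
# filtering would happen at the call site, e.g.:
smashable_files = sample.get_result_files().filter(is_smashable=True)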
def test_dualtech_smash(self):
    """Smash microarray and RNA-seq samples together, aggregated three ways."""
    pj = ProcessorJob()
    pj.pipeline_applied = "SMASHER"
    pj.save()

    # MICROARRAY TECH
    experiment = Experiment()
    experiment.accession_code = "GSE1487313"
    experiment.save()

    result = ComputationalResult()
    result.save()

    gallus_gallus = Organism.get_object_for_name("GALLUS_GALLUS")

    sample = Sample()
    sample.accession_code = 'GSM1487313'
    sample.title = 'GSM1487313'
    sample.organism = gallus_gallus
    sample.technology = "MICROARRAY"
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1487313_liver.PCL"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    # RNASEQ TECH
    experiment2 = Experiment()
    experiment2.accession_code = "SRS332914"
    experiment2.save()

    result2 = ComputationalResult()
    result2.save()

    sample2 = Sample()
    sample2.accession_code = 'SRS332914'
    sample2.title = 'SRS332914'
    sample2.organism = gallus_gallus
    sample2.technology = "RNA-SEQ"
    sample2.save()

    sra2 = SampleResultAssociation()
    sra2.sample = sample2
    sra2.result = result2
    sra2.save()

    esa2 = ExperimentSampleAssociation()
    esa2.experiment = experiment2
    esa2.sample = sample2
    esa2.save()

    computed_file2 = ComputedFile()
    computed_file2.filename = "SRP149598_gene_lengthScaledTPM.tsv"
    computed_file2.absolute_file_path = "/home/user/data_store/PCL/" + computed_file2.filename
    computed_file2.result = result2
    computed_file2.size_in_bytes = 234
    computed_file2.is_smashable = True
    computed_file2.save()

    assoc2 = SampleComputedFileAssociation()
    assoc2.sample = sample2
    assoc2.computed_file = computed_file2
    assoc2.save()

    # CROSS-SMASH BY SPECIES
    ds = Dataset()
    ds.data = {'GSE1487313': ['GSM1487313'], 'SRX332914': ['SRS332914']}
    ds.aggregate_by = 'SPECIES'
    ds.scale_by = 'STANDARD'
    ds.email_address = "*****@*****.**"
    ds.quantile_normalize = False
    ds.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = pj
    pjda.dataset = ds
    pjda.save()

    self.assertTrue(ds.is_cross_technology())

    final_context = smasher.smash(pj.pk, upload=False)
    self.assertTrue(os.path.exists(final_context['output_file']))
    os.remove(final_context['output_file'])
    self.assertEqual(len(final_context['final_frame'].columns), 2)

    # THEN BY EXPERIMENT
    ds.aggregate_by = 'EXPERIMENT'
    ds.save()

    dsid = ds.id
    ds = Dataset.objects.get(id=dsid)

    pj.start_time = None
    pj.end_time = None
    pj.save()

    final_context = smasher.smash(pj.pk, upload=False)
    self.assertTrue(os.path.exists(final_context['output_file']))
    os.remove(final_context['output_file'])
    self.assertEqual(len(final_context['final_frame'].columns), 1)

    # THEN BY ALL
    ds.aggregate_by = 'ALL'
    ds.save()

    dsid = ds.id
    ds = Dataset.objects.get(id=dsid)

    pj.start_time = None
    pj.end_time = None
    pj.save()

    final_context = smasher.smash(pj.pk, upload=False)
    self.assertTrue(os.path.exists(final_context['output_file']))
    self.assertEqual(len(final_context['final_frame'].columns), 2)
def setUp(self):
    # Saving this for if we have protected endpoints
    # self.superuser = User.objects.create_superuser('john', '*****@*****.**', 'johnpassword')
    # self.client.login(username='******', password='******')
    # self.user = User.objects.create(username="******")

    experiment = Experiment()
    experiment.accession_code = "GSE000"
    experiment.alternate_accession_code = "E-GEOD-000"
    experiment.title = "NONONONO"
    experiment.description = "Boooooourns. Wasabi."
    experiment.technology = "RNA-SEQ"
    experiment.save()

    experiment = Experiment()
    experiment.accession_code = "GSE123"
    experiment.title = "Hey Ho Let's Go"
    experiment.description = (
        "This is a very exciting test experiment. Faygo soda. Blah blah blah."
    )
    experiment.technology = "MICROARRAY"
    experiment.save()
    self.experiment = experiment

    experiment_annotation = ExperimentAnnotation()
    experiment_annotation.data = {"hello": "world", "123": 456}
    experiment_annotation.experiment = experiment
    experiment_annotation.save()

    # Create 26 test organisms numbered 0-25 for the pagination test, so there
    # should be 29 organisms total (with the 3 others below).
    for i in range(26):
        Organism(name=("TEST_ORGANISM_{}".format(i)), taxonomy_id=(1234 + i)).save()

    ailuropoda = Organism(
        name="AILUROPODA_MELANOLEUCA", taxonomy_id=9646, is_scientific_name=True
    )
    ailuropoda.save()

    self.homo_sapiens = Organism(
        name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True
    )
    self.homo_sapiens.save()

    self.danio_rerio = Organism(
        name="DANIO_RERIO", taxonomy_id=1337, is_scientific_name=True
    )
    self.danio_rerio.save()

    sample = Sample()
    sample.title = "123"
    sample.accession_code = "123"
    sample.is_processed = True
    sample.organism = ailuropoda
    sample.save()

    sample = Sample()
    sample.title = "789"
    sample.accession_code = "789"
    sample.is_processed = True
    sample.organism = ailuropoda
    sample.save()
    self.sample = sample

    # add qn target for sample organism
    result = ComputationalResult()
    result.commands.append("create_qn_target.py")
    result.is_ccdl = True
    result.is_public = True
    result.processor = None
    result.save()

    cra = ComputationalResultAnnotation()
    cra.result = result
    cra.data = {"organism_id": ailuropoda.id, "is_qn": True}
    cra.save()

    ailuropoda.qn_target = result
    ailuropoda.save()

    sample_annotation = SampleAnnotation()
    sample_annotation.data = {"goodbye": "world", "789": 123}
    sample_annotation.sample = sample
    sample_annotation.save()

    original_file = OriginalFile()
    original_file.save()

    original_file_sample_association = OriginalFileSampleAssociation()
    original_file_sample_association.sample = sample
    original_file_sample_association.original_file = original_file
    original_file_sample_association.save()

    downloader_job = DownloaderJob()
    downloader_job.save()

    download_assoc = DownloaderJobOriginalFileAssociation()
    download_assoc.original_file = original_file
    download_assoc.downloader_job = downloader_job
    download_assoc.save()

    processor_job = ProcessorJob()
    processor_job.save()

    processor_assoc = ProcessorJobOriginalFileAssociation()
    processor_assoc.original_file = original_file
    processor_assoc.processor_job = processor_job
    processor_assoc.save()

    experiment_sample_association = ExperimentSampleAssociation()
    experiment_sample_association.sample = sample
    experiment_sample_association.experiment = experiment
    experiment_sample_association.save()
    experiment.num_total_samples = 1
    experiment.num_processed_samples = 1
    experiment.save()

    result = ComputationalResult()
    result.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    result = ComputationalResult()
    result.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    processor = Processor()
    processor.name = "Salmon Quant"
    processor.version = "v9.9.9"
    processor.docker_image = "dr_salmon"
    processor.environment = '{"some": "environment"}'
    processor.save()

    computational_result_short = ComputationalResult(processor=processor)
    computational_result_short.save()

    organism_index = OrganismIndex()
    organism_index.index_type = "TRANSCRIPTOME_SHORT"
    organism_index.organism = self.danio_rerio
    organism_index.result = computational_result_short
    organism_index.absolute_directory_path = (
        "/home/user/data_store/salmon_tests/TRANSCRIPTOME_INDEX/SHORT"
    )
    organism_index.is_public = True
    organism_index.s3_url = "not_blank"
    organism_index.save()

    return
def test_create_compendia(self):
    job = ProcessorJob()
    job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
    job.save()

    # MICROARRAY TECH
    experiment = Experiment()
    experiment.accession_code = "GSE1487313"
    experiment.save()

    result = ComputationalResult()
    result.save()

    gallus_gallus = Organism.get_object_for_name("GALLUS_GALLUS", taxonomy_id=1001)

    sample = Sample()
    sample.accession_code = "GSM1487313"
    sample.title = "GSM1487313"
    sample.organism = gallus_gallus
    sample.technology = "MICROARRAY"
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1487313_liver.PCL"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    # Missing sample that will be filtered
    sample = Sample()
    sample.accession_code = "GSM1487222"
    sample.title = "this sample will be filtered"
    sample.organism = gallus_gallus
    sample.technology = "MICROARRAY"
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1487222_empty.PCL"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/doesnt_exists.PCL"
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    # RNASEQ TECH
    experiment2 = Experiment()
    experiment2.accession_code = "SRS332914"
    experiment2.save()

    result2 = ComputationalResult()
    result2.save()

    sample2 = Sample()
    sample2.accession_code = "SRS332914"
    sample2.title = "SRS332914"
    sample2.organism = gallus_gallus
    sample2.technology = "RNA-SEQ"
    sample2.save()

    sra2 = SampleResultAssociation()
    sra2.sample = sample2
    sra2.result = result2
    sra2.save()

    esa2 = ExperimentSampleAssociation()
    esa2.experiment = experiment2
    esa2.sample = sample2
    esa2.save()

    computed_file2 = ComputedFile()
    computed_file2.filename = "SRP149598_gene_lengthScaledTPM.tsv"
    computed_file2.absolute_file_path = "/home/user/data_store/PCL/" + computed_file2.filename
    computed_file2.result = result2
    computed_file2.size_in_bytes = 234
    computed_file2.is_smashable = True
    computed_file2.save()

    assoc2 = SampleComputedFileAssociation()
    assoc2.sample = sample2
    assoc2.computed_file = computed_file2
    assoc2.save()

    dset = Dataset()
    dset.data = {
        "GSE1487313": ["GSM1487313", "GSM1487222"],
        "SRX332914": ["SRS332914"],
    }
    dset.scale_by = "NONE"
    dset.aggregate_by = "SPECIES"
    dset.svd_algorithm = "ARPACK"
    dset.quantile_normalize = False
    dset.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dset
    pjda.save()

    final_context = create_compendia.create_compendia(job.id)

    self.assertFalse(job.success)

    # check that sample with no computed file was skipped
    self.assertTrue("GSM1487222" in final_context["filtered_samples"])
    self.assertEqual(
        final_context["filtered_samples"]["GSM1487222"]["experiment_accession_code"],
        "GSE1487313",
    )
def test_get_tximport_inputs(self):
    """Tests that tximport only considers RNA-Seq samples from GEO."""
    # Create one experiment and two related samples, based on:
    # https://www.ncbi.nlm.nih.gov/sra/?term=SRP040623
    # (We don't need any original files because
    # get_tximport_inputs doesn't consider them.)
    experiment_accession = 'PRJNA242809'
    experiment = Experiment.objects.create(accession_code=experiment_accession)

    c_elegans = Organism.get_object_for_name("CAENORHABDITIS_ELEGANS")

    ## Sample 1
    sample1_accession = 'SRR1206053'
    sample1 = Sample.objects.create(accession_code=sample1_accession, organism=c_elegans)
    sample1.source_database = 'GEO'
    sample1.technology = 'RNA-SEQ'
    sample1.save()  # persist the source_database/technology changes

    ExperimentSampleAssociation.objects.create(experiment=experiment, sample=sample1)

    ## Sample 2
    sample2_accession = 'SRR1206054'
    sample2 = Sample.objects.create(accession_code=sample2_accession, organism=c_elegans)
    sample2.source_database = 'GEO'
    sample2.technology = 'RNA-SEQ'
    sample2.save()  # persist the source_database/technology changes

    ExperimentSampleAssociation.objects.create(experiment=experiment, sample=sample2)

    computational_result1 = ComputationalResult(processor=utils.find_processor('SALMON_QUANT'))
    computational_result1.save()

    sample_result_assoc = SampleResultAssociation(sample=sample1, result=computational_result1)
    sample_result_assoc.save()

    comp_file = ComputedFile()
    comp_file.absolute_file_path = "/doesnt/matter"
    comp_file.result = computational_result1
    comp_file.size_in_bytes = 1337
    comp_file.sha1 = "ABC"
    comp_file.s3_key = "key"
    comp_file.s3_bucket = "bucket"
    comp_file.save()

    computational_result2 = ComputationalResult(processor=utils.find_processor('SALMON_QUANT'))
    computational_result2.save()

    sample_result_assoc = SampleResultAssociation(sample=sample2, result=computational_result2)
    sample_result_assoc.save()

    comp_file = ComputedFile()
    comp_file.absolute_file_path = "/doesnt/matter"
    comp_file.result = computational_result2
    comp_file.size_in_bytes = 1337
    comp_file.sha1 = "ABC"
    comp_file.s3_key = "key"
    comp_file.s3_bucket = "bucket"
    comp_file.save()

    quantified_experiments = salmon.get_tximport_inputs({"sample": sample1})['tximport_inputs']

    self.assertEqual({}, quantified_experiments)
def test_create_compendia(self):
    job = ProcessorJob()
    job.pipeline_applied = "COMPENDIA"
    job.save()

    # MICROARRAY TECH
    experiment = Experiment()
    experiment.accession_code = "GSE1487313"
    experiment.save()

    result = ComputationalResult()
    result.save()

    gallus_gallus = Organism.get_object_for_name("GALLUS_GALLUS")

    sample = Sample()
    sample.accession_code = 'GSM1487313'
    sample.title = 'GSM1487313'
    sample.organism = gallus_gallus
    sample.technology = "MICROARRAY"
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1487313_liver.PCL"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    # RNASEQ TECH
    experiment2 = Experiment()
    experiment2.accession_code = "SRS332914"
    experiment2.save()

    result2 = ComputationalResult()
    result2.save()

    sample2 = Sample()
    sample2.accession_code = 'SRS332914'
    sample2.title = 'SRS332914'
    sample2.organism = gallus_gallus
    sample2.technology = "RNA-SEQ"
    sample2.save()

    sra2 = SampleResultAssociation()
    sra2.sample = sample2
    sra2.result = result2
    sra2.save()

    esa2 = ExperimentSampleAssociation()
    esa2.experiment = experiment2
    esa2.sample = sample2
    esa2.save()

    computed_file2 = ComputedFile()
    computed_file2.filename = "SRP149598_gene_lengthScaledTPM.tsv"
    computed_file2.absolute_file_path = "/home/user/data_store/PCL/" + computed_file2.filename
    computed_file2.result = result2
    computed_file2.size_in_bytes = 234
    computed_file2.is_smashable = True
    computed_file2.save()

    assoc2 = SampleComputedFileAssociation()
    assoc2.sample = sample2
    assoc2.computed_file = computed_file2
    assoc2.save()

    dset = Dataset()
    dset.data = {'GSE1487313': ['GSM1487313'], 'SRX332914': ['SRS332914']}
    dset.scale_by = 'NONE'
    dset.aggregate_by = 'SPECIES'
    dset.quantile_normalize = False
    dset.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dset
    pjda.save()

    final_context = create_compendia.create_compendia(job.id)
def test_create_compendia_danio(self):
    job = ProcessorJob()
    job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
    job.save()

    # MICROARRAY TECH
    experiment = Experiment()
    experiment.accession_code = "GSE1234"
    experiment.save()

    result = ComputationalResult()
    result.save()

    qn_target = ComputedFile()
    qn_target.filename = "danio_target.tsv"
    qn_target.absolute_file_path = "/home/user/data_store/QN/danio_target.tsv"
    qn_target.is_qn_target = True
    qn_target.size_in_bytes = "12345"
    qn_target.sha1 = "aabbccddeeff"
    qn_target.result = result
    qn_target.save()

    danio_rerio = Organism(name="DANIO_RERIO", taxonomy_id=1, qn_target=result)
    danio_rerio.save()

    cra = ComputationalResultAnnotation()
    cra.data = {}
    cra.data["organism_id"] = danio_rerio.id
    cra.data["is_qn"] = True
    cra.result = result
    cra.save()

    result = ComputationalResult()
    result.save()

    micros = []
    for file in os.listdir("/home/user/data_store/raw/TEST/MICROARRAY/"):
        if "microarray.txt" in file:
            continue

        sample = Sample()
        sample.accession_code = file
        sample.title = file
        sample.organism = danio_rerio
        sample.technology = "MICROARRAY"
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = file
        computed_file.absolute_file_path = "/home/user/data_store/raw/TEST/MICROARRAY/" + file
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        micros.append(file)

    experiment = Experiment()
    experiment.accession_code = "GSE5678"
    experiment.save()

    result = ComputationalResult()
    result.save()

    rnas = []
    for file in os.listdir("/home/user/data_store/raw/TEST/RNASEQ/"):
        if "rnaseq.txt" in file:
            continue

        sample = Sample()
        sample.accession_code = file
        sample.title = file
        sample.organism = danio_rerio
        sample.technology = "RNASEQ"
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = file
        computed_file.absolute_file_path = "/home/user/data_store/raw/TEST/RNASEQ/" + file
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        rnas.append(file)

    # Missing sample that will be filtered
    sample = Sample()
    sample.accession_code = "GSM1487222"
    sample.title = "this sample will be filtered"
    sample.organism = danio_rerio
    sample.technology = "RNASEQ"
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    rnas.append(sample.accession_code)

    dset = Dataset()
    dset.data = {"GSE1234": micros, "GSE5678": rnas}
    dset.scale_by = "NONE"
    dset.aggregate_by = "SPECIES"
    dset.svd_algorithm = "ARPACK"
    dset.quantile_normalize = False
    dset.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dset
    pjda.save()

    final_context = create_compendia.create_compendia(job.id)

    # Verify result
    self.assertEqual(final_context["compendium_result"].result.computedfile_set.count(), 1)
    for file in final_context["compendium_result"].result.computedfile_set.all():
        self.assertTrue(os.path.exists(file.absolute_file_path))

    # test compendium_result
    self.assertEqual(final_context["compendium_result"].svd_algorithm, "ARPACK")
    self.assertEqual(
        final_context["compendium_result"].primary_organism.name,
        final_context["organism_name"],
    )
    self.assertEqual(final_context["compendium_result"].primary_organism.name, "DANIO_RERIO")
    self.assertEqual(final_context["compendium_result"].organisms.count(), 1)

    # check that sample with no computed file was skipped
    self.assertTrue("GSM1487222" in final_context["filtered_samples"])
    self.assertEqual(
        final_context["filtered_samples"]["GSM1487222"]["experiment_accession_code"],
        "GSE5678",
    )
def test_create_compendia_danio(self):
    job = ProcessorJob()
    job.pipeline_applied = "COMPENDIA"
    job.save()

    # MICROARRAY TECH
    experiment = Experiment()
    experiment.accession_code = "GSE1234"
    experiment.save()

    result = ComputationalResult()
    result.save()

    danio_rerio = Organism.get_object_for_name("DANIO_RERIO")

    micros = []
    for file in os.listdir('/home/user/data_store/raw/TEST/MICROARRAY/'):
        if 'microarray.txt' in file:
            continue

        sample = Sample()
        sample.accession_code = file
        sample.title = file
        sample.organism = danio_rerio
        sample.technology = "MICROARRAY"
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = file
        computed_file.absolute_file_path = "/home/user/data_store/raw/TEST/MICROARRAY/" + file
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        micros.append(file)

    experiment = Experiment()
    experiment.accession_code = "GSE5678"
    experiment.save()

    result = ComputationalResult()
    result.save()

    rnas = []
    for file in os.listdir('/home/user/data_store/raw/TEST/RNASEQ/'):
        if 'rnaseq.txt' in file:
            continue

        sample = Sample()
        sample.accession_code = file
        sample.title = file
        sample.organism = danio_rerio
        sample.technology = "RNASEQ"
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = file
        computed_file.absolute_file_path = "/home/user/data_store/raw/TEST/RNASEQ/" + file
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        rnas.append(file)

    result = ComputationalResult()
    result.save()

    qn_target = ComputedFile()
    qn_target.filename = "danio_target.tsv"
    qn_target.absolute_file_path = '/home/user/data_store/QN/danio_target.tsv'
    qn_target.is_qn_target = True
    qn_target.size_in_bytes = "12345"
    qn_target.sha1 = "aabbccddeeff"
    qn_target.result = result
    qn_target.save()

    cra = ComputationalResultAnnotation()
    cra.data = {}
    cra.data['organism_id'] = danio_rerio.id
    cra.data['is_qn'] = True
    cra.result = result
    cra.save()

    dset = Dataset()
    dset.data = {'GSE1234': micros, 'GSE5678': rnas}
    dset.scale_by = 'NONE'
    dset.aggregate_by = 'SPECIES'
    dset.quantile_normalize = False
    dset.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dset
    pjda.save()

    final_context = create_compendia.create_compendia(job.id)

    # Verify result
    self.assertEqual(len(final_context['computed_files']), 3)
    for file in final_context['computed_files']:
        self.assertTrue(os.path.exists(file.absolute_file_path))
def _run_salmontools(job_context: Dict) -> Dict:
    """Run SalmonTools to extract unmapped reads."""
    logger.debug("Running SalmonTools ...")

    unmapped_filename = job_context["output_directory"] + "aux_info/unmapped_names.txt"
    command_str = "salmontools extract-unmapped -u {unmapped_file} -o {output} "
    output_prefix = job_context["salmontools_directory"] + "unmapped_by_salmon"
    command_str = command_str.format(unmapped_file=unmapped_filename, output=output_prefix)
    if "input_file_path_2" in job_context:
        command_str += "-1 {input_1} -2 {input_2}"
        command_str = command_str.format(
            input_1=job_context["input_file_path"],
            input_2=job_context["input_file_path_2"],
        )
    else:
        command_str += "-r {input_1}"
        command_str = command_str.format(input_1=job_context["input_file_path"])

    start_time = timezone.now()
    logger.debug(
        "Running the following SalmonTools command: %s",
        command_str,
        processor_job=job_context["job_id"],
    )
    completed_command = subprocess.run(
        command_str.split(), stdout=subprocess.PIPE, stderr=subprocess.PIPE
    )
    end_time = timezone.now()

    # As of SalmonTools 0.1.0, completed_command.returncode is always 0
    # (even if an error happens). completed_command.stderr is not totally
    # reliable either, because it will output the following line even
    # when the execution succeeds:
    #   "There were <N> unmapped reads\n"
    # in which "<N>" is the number of lines in the input unmapped_names.txt.
    #
    # As a workaround, we are using a regular expression here to test
    # the status of the SalmonTools execution. Any text in stderr that is
    # not in the above format is treated as an error message.
    status_str = completed_command.stderr.decode().strip()
    success_pattern = r"^There were \d+ unmapped reads$"
    if re.match(success_pattern, status_str):
        # Zip up the output of SalmonTools
        try:
            with tarfile.open(job_context["salmontools_archive"], "w:gz") as tar:
                tar.add(job_context["salmontools_directory"], arcname=os.sep)
        except Exception:
            logger.exception(
                "Exception caught while zipping processed directory %s",
                job_context["salmontools_directory"],
                processor_job=job_context["job_id"],
            )
            failure_template = "Exception caught while zipping salmontools directory {}"
            job_context["job"].failure_reason = failure_template.format(
                job_context["salmontools_archive"]
            )
            job_context["success"] = False
            return job_context

        result = ComputationalResult()
        result.commands.append(command_str)
        result.time_start = start_time
        result.time_end = end_time
        result.is_ccdl = True

        try:
            processor_key = "SALMONTOOLS"
            result.processor = utils.find_processor(processor_key)
        except Exception as e:
            return utils.handle_processor_exception(job_context, processor_key, e)

        result.save()
        job_context["pipeline"].steps.append(result.id)

        assoc = SampleResultAssociation()
        assoc.sample = job_context["sample"]
        assoc.result = result
        assoc.save()

        computed_file = ComputedFile()
        computed_file.filename = job_context["salmontools_archive"].split("/")[-1]
        computed_file.absolute_file_path = job_context["salmontools_archive"]
        computed_file.calculate_sha1()
        computed_file.calculate_size()
        computed_file.is_public = True
        computed_file.is_smashable = False
        computed_file.is_qc = True
        computed_file.result = result
        computed_file.save()
        job_context["computed_files"].append(computed_file)

        assoc = SampleComputedFileAssociation()
        assoc.sample = job_context["sample"]
        assoc.computed_file = computed_file
        assoc.save()

        job_context["result"] = result
        job_context["success"] = True
    else:  # error in salmontools
        logger.error(
            "Shell call to salmontools failed with error message: %s",
            status_str,
            processor_job=job_context["job_id"],
        )
        job_context["job"].failure_reason = (
            "Shell call to salmontools failed because: " + status_str
        )
        job_context["success"] = False

    return job_context
def prepare_computed_files():
    # MICROARRAY TECH
    experiment = Experiment()
    experiment.accession_code = "GSE1487313"
    experiment.num_processed_samples = 1
    experiment.save()

    result = ComputationalResult()
    result.save()

    gallus_gallus = Organism.get_object_for_name("GALLUS_GALLUS", taxonomy_id=1001)

    sample = Sample()
    sample.accession_code = "GSM1487313"
    sample.title = "GSM1487313"
    sample.organism = gallus_gallus
    sample.technology = "MICROARRAY"
    sample.is_processed = True
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1487313_liver.PCL"
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.s3_key = "GSM1487313_liver.PCL"
    computed_file.s3_bucket = TEST_DATA_BUCKET
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    # RNASEQ TECH
    experiment2 = Experiment()
    experiment2.accession_code = "SRP332914"
    experiment2.num_processed_samples = 1
    experiment2.save()

    result2 = ComputationalResult()
    result2.save()

    sample2 = Sample()
    sample2.accession_code = "SRR332914"
    sample2.title = "SRR332914"
    sample2.organism = gallus_gallus
    sample2.technology = "RNA-SEQ"
    sample2.is_processed = True
    sample2.save()

    sra2 = SampleResultAssociation()
    sra2.sample = sample2
    sra2.result = result2
    sra2.save()

    esa2 = ExperimentSampleAssociation()
    esa2.experiment = experiment2
    esa2.sample = sample2
    esa2.save()

    computed_file2 = ComputedFile()
    computed_file2.filename = "SRP149598_gene_lengthScaledTPM.tsv"
    computed_file2.result = result2
    computed_file2.size_in_bytes = 234
    computed_file2.is_smashable = True
    computed_file2.s3_key = "SRP149598_gene_lengthScaledTPM.tsv"
    computed_file2.s3_bucket = TEST_DATA_BUCKET
    computed_file2.save()

    assoc2 = SampleComputedFileAssociation()
    assoc2.sample = sample2
    assoc2.computed_file = computed_file2
    assoc2.save()
def test_no_smash_dupe_two(self):
    """Tests the SRP051449 case, where the titles collide.

    Also uses a real QN target file.
    """
    job = ProcessorJob()
    job.pipeline_applied = "SMASHER"
    job.save()

    experiment = Experiment()
    experiment.accession_code = "SRP051449"
    experiment.save()

    result = ComputationalResult()
    result.save()

    danio_rerio = Organism.get_object_for_name("DANIO_RERIO")

    sample = Sample()
    sample.accession_code = 'SRR1731761'
    sample.title = 'Danio rerio'
    sample.organism = danio_rerio
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.filename = "SRR1731761_output_gene_lengthScaledTPM.tsv"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    result = ComputationalResult()
    result.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    sample = Sample()
    sample.accession_code = 'SRR1731762'
    sample.title = 'Danio rerio'
    sample.organism = danio_rerio
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.filename = "SRR1731762_output_gene_lengthScaledTPM.tsv"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    result = ComputationalResult()
    result.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    ds = Dataset()
    ds.data = {'SRP051449': ['SRR1731761', 'SRR1731762']}
    ds.aggregate_by = 'SPECIES'
    ds.scale_by = 'NONE'
    ds.email_address = "*****@*****.**"
    ds.quantile_normalize = True
    ds.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = ds
    pjda.save()

    cr = ComputationalResult()
    cr.save()

    computed_file = ComputedFile()
    computed_file.filename = "danio_target.tsv"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = cr
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = False
    computed_file.save()

    cra = ComputationalResultAnnotation()
    cra.data = {'organism_id': danio_rerio.id, 'is_qn': True}
    cra.result = cr
    cra.save()

    final_context = smasher.smash(job.pk, upload=False)
    self.assertTrue(final_context['success'])
def _tximport(job_context: Dict, experiment: Experiment, quant_files: List[ComputedFile]) -> Dict:
    """Run the tximport R script based on the input quant files and the path
    of genes_to_transcripts.txt.
    """
    # Download all the quant.sf files for this experiment. Write all
    # their paths to a file so we can pass a path to that to
    # tximport.R rather than having to pass in one argument per
    # sample.
    tximport_path_list_file = job_context["work_dir"] + "tximport_inputs.txt"
    with open(tximport_path_list_file, "w") as input_list:
        for quant_file in quant_files:
            input_list.write(quant_file.get_synced_file_path() + "\n")

    rds_filename = "txi_out.RDS"
    rds_file_path = job_context["work_dir"] + rds_filename
    tpm_filename = "gene_lengthScaledTPM.tsv"
    tpm_file_path = job_context["work_dir"] + tpm_filename

    result = ComputationalResult()
    cmd_tokens = [
        "/usr/bin/Rscript", "--vanilla",
        "/home/user/data_refinery_workers/processors/tximport.R",
        "--file_list", tximport_path_list_file,
        "--gene2txmap", job_context["genes_to_transcripts_path"],
        "--rds_file", rds_file_path,
        "--tpm_file", tpm_file_path,
    ]
    result.time_start = timezone.now()

    logger.debug(
        "Running tximport with: %s",
        str(cmd_tokens),
        processor_job=job_context['job_id'],
        experiment=experiment.id,
    )

    try:
        tximport_result = subprocess.run(cmd_tokens, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    except Exception as e:
        error_template = "Encountered error in R code while running tximport.R: {}"
        error_message = error_template.format(str(e))
        logger.error(error_message, processor_job=job_context["job_id"], experiment=experiment.id)
        job_context["job"].failure_reason = error_message
        job_context["success"] = False
        return job_context

    if tximport_result.returncode != 0:
        error_template = "Found non-zero exit code from R code while running tximport.R: {}"
        error_message = error_template.format(tximport_result.stderr.decode().strip())
        logger.error(error_message, processor_job=job_context["job_id"], experiment=experiment.id)
        job_context["job"].failure_reason = error_message
        job_context["success"] = False
        return job_context

    result.time_end = timezone.now()
    result.commands.append(" ".join(cmd_tokens))
    result.is_ccdl = True
    try:
        processor_key = "TXIMPORT"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()
    job_context['pipeline'].steps.append(result.id)

    # Associate this result with all samples in this experiment.
    # TODO: This may not be completely sensible, because `tximport` is
    # done at experiment level, not at sample level.
    # Could be very problematic if SRA's data model allows many
    # Experiments to one Run.
    # https://github.com/AlexsLemonade/refinebio/issues/297
    for sample in experiment.samples.all():
        s_r = SampleResultAssociation(sample=sample, result=result)
        s_r.save()

    rds_file = ComputedFile()
    rds_file.absolute_file_path = rds_file_path
    rds_file.filename = rds_filename
    rds_file.result = result
    rds_file.is_smashable = False
    rds_file.is_qc = False
    rds_file.is_public = True
    rds_file.calculate_sha1()
    rds_file.calculate_size()
    rds_file.save()
    job_context['computed_files'].append(rds_file)

    # Split the tximport result into smashable subfiles
    data = pd.read_csv(tpm_file_path, sep='\t', header=0, index_col=0)
    individual_files = []
    frames = np.split(data, len(data.columns), axis=1)
    for frame in frames:
        # Create a sample-specific TPM file.
        sample_file_name = frame.columns.values[0] + '_' + tpm_filename
        frame_path = os.path.join(job_context["work_dir"], sample_file_name)
        frame.to_csv(frame_path, sep='\t', encoding='utf-8')

        # The frame column header is based off of the path, which includes "_output".
        sample = Sample.objects.get(accession_code=frame.columns.values[0].replace("_output", ""))

        computed_file = ComputedFile()
        computed_file.absolute_file_path = frame_path
        computed_file.filename = sample_file_name
        computed_file.result = result
        computed_file.is_smashable = True
        computed_file.is_qc = False
        computed_file.is_public = True
        computed_file.calculate_sha1()
        computed_file.calculate_size()
        computed_file.save()
        job_context['computed_files'].append(computed_file)
        job_context['smashable_files'].append(computed_file)

        SampleResultAssociation.objects.get_or_create(sample=sample, result=result)

        # Create association with the RDS file.
        SampleComputedFileAssociation.objects.get_or_create(sample=sample, computed_file=rds_file)

        # Create association with the TPM file.
        SampleComputedFileAssociation.objects.get_or_create(
            sample=sample, computed_file=computed_file
        )

        individual_files.append(computed_file)
        job_context['samples'].append(sample)

    # Clean up the quant.sf files that were created just for this.
    for quant_file in quant_files:
        quant_file.delete_s3_file()
        # It's only okay to delete the local file because the full
        # output directory has already been zipped up.
        quant_file.delete_local_file()
        quant_file.delete()

    # Salmon-processed samples aren't marked as is_processed
    # until they are fully tximported; this value sets that
    # for the end_job function.
    job_context['tximported'] = True
    job_context['individual_files'] = individual_files
    return job_context
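# Standalone sketch of the column-splitting step above, using synthetic data.
# Note it relies on np.split accepting a DataFrame (via DataFrame.swapaxes),
# which worked with the pandas versions contemporary with this processor but
# was removed in newer pandas releases.
import numpy as np
import pandas as pd

data = pd.DataFrame(
    {"SRR1731761_output": [1.0, 2.0], "SRR1731762_output": [3.0, 4.0]},
    index=["GENE_A", "GENE_B"],
)
# One single-column frame per sample; each column header carries the
# "_output" suffix that the loop above strips to recover the accession code.
frames = np.split(data, len(data.columns), axis=1)
assert [f.columns.values[0] for f in frames] == ["SRR1731761_output", "SRR1731762_output"]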
def test_log2(self):
    pj = ProcessorJob()
    pj.pipeline_applied = "SMASHER"
    pj.save()

    # Has non-log2 data:
    # https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE44421
    # ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE44nnn/GSE44421/miniml/GSE44421_family.xml.tgz
    experiment = Experiment()
    experiment.accession_code = "GSE44421"
    experiment.save()

    result = ComputationalResult()
    result.save()

    homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

    sample = Sample()
    sample.accession_code = 'GSM1084806'
    sample.title = 'GSM1084806'
    sample.organism = homo_sapiens
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1084806-tbl-1.txt"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    sample = Sample()
    sample.accession_code = 'GSM1084807'
    sample.title = 'GSM1084807'
    sample.organism = homo_sapiens
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1084807-tbl-1.txt"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    ds = Dataset()
    ds.data = {'GSE44421': ['GSM1084806', 'GSM1084807']}
    ds.aggregate_by = 'EXPERIMENT'
    ds.scale_by = 'MINMAX'
    ds.email_address = "*****@*****.**"
    ds.quantile_normalize = False
    ds.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = pj
    pjda.dataset = ds
    pjda.save()

    final_context = smasher.smash(pj.pk, upload=False)
    ds = Dataset.objects.get(id=ds.id)

    self.assertTrue(final_context['success'])
def test_bad_overlap(self):
    pj = ProcessorJob()
    pj.pipeline_applied = "SMASHER"
    pj.save()

    experiment = Experiment()
    experiment.accession_code = "GSE51081"
    experiment.save()

    result = ComputationalResult()
    result.save()

    homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

    sample = Sample()
    sample.accession_code = 'GSM1237810'
    sample.title = 'GSM1237810'
    sample.organism = homo_sapiens
    sample.save()

    sample_annotation = SampleAnnotation()
    sample_annotation.data = {'hi': 'friend'}
    sample_annotation.sample = sample
    sample_annotation.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.filename = "big.PCL"
    computed_file.absolute_file_path = "/home/user/data_store/BADSMASH/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    sample = Sample()
    sample.accession_code = 'GSM1237812'
    sample.title = 'GSM1237812'
    sample.organism = homo_sapiens
    sample.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    computed_file = ComputedFile()
    computed_file.filename = "small.PCL"
    computed_file.absolute_file_path = "/home/user/data_store/BADSMASH/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    ds = Dataset()
    ds.data = {'GSE51081': ['GSM1237810', 'GSM1237812']}
    ds.aggregate_by = 'ALL'  # [ALL or SPECIES or EXPERIMENT]
    ds.scale_by = 'NONE'  # [NONE or MINMAX or STANDARD or ROBUST]
    ds.email_address = "*****@*****.**"
    ds.quantile_normalize = False
    ds.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = pj
    pjda.dataset = ds
    pjda.save()

    final_context = smasher.smash(pj.pk, upload=False)
    ds = Dataset.objects.get(id=ds.id)

    pj = ProcessorJob()
    pj.pipeline_applied = "SMASHER"
    pj.save()

    # Now, make sure the bad can't zero this out.
    sample = Sample()
    sample.accession_code = 'GSM999'
    sample.title = 'GSM999'
    sample.organism = homo_sapiens
    sample.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    computed_file = ComputedFile()
    computed_file.filename = "bad.PCL"
    computed_file.absolute_file_path = "/home/user/data_store/BADSMASH/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    ds = Dataset()
    ds.data = {'GSE51081': ['GSM1237810', 'GSM1237812', 'GSM999']}
    ds.aggregate_by = 'ALL'  # [ALL or SPECIES or EXPERIMENT]
    ds.scale_by = 'NONE'  # [NONE or MINMAX or STANDARD or ROBUST]
    ds.email_address = "*****@*****.**"
    ds.quantile_normalize = False
    ds.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = pj
    pjda.dataset = ds
    pjda.save()

    final_context = smasher.smash(pj.pk, upload=False)
    ds = Dataset.objects.get(id=ds.id)

    self.assertEqual(len(final_context['final_frame']), 4)
def test_dataset_adding_non_downloadable_samples_fails(self):
    # Make a sample that is not downloadable
    sample1 = Sample()
    sample1.title = "456"
    sample1.accession_code = "456"
    sample1.platform_name = "AFFY"
    sample1.is_processed = False
    sample1.organism = self.homo_sapiens
    sample1.save()

    experiment_sample_association = ExperimentSampleAssociation()
    experiment_sample_association.sample = sample1
    experiment_sample_association.experiment = self.experiment
    experiment_sample_association.save()

    # Bad, 456 is not processed
    jdata = json.dumps(
        {"email_address": "*****@*****.**", "data": {"GSE123": ["456"]}}
    )
    response = self.client.post(
        reverse("create_dataset", kwargs={"version": API_VERSION}),
        jdata,
        content_type="application/json",
    )
    self.assertEqual(response.status_code, 400)
    self.assertIn(
        "Non-downloadable sample(s) in dataset",
        response.json()["message"],
    )
    self.assertEqual(response.json()["details"], ["456"])

    cache.clear()

    # Bad, 567 does not exist
    jdata = json.dumps(
        {"email_address": "*****@*****.**", "data": {"GSE123": ["567"]}}
    )
    response = self.client.post(
        reverse("create_dataset", kwargs={"version": API_VERSION}),
        jdata,
        content_type="application/json",
    )
    self.assertIn(
        "Sample(s) in dataset do not exist on refine",
        response.json()["message"],
    )
    self.assertEqual(response.status_code, 400)

    cache.clear()

    # Good, 789 is processed
    jdata = json.dumps(
        {"email_address": "*****@*****.**", "data": {"GSE123": ["789"]}}
    )
    response = self.client.post(
        reverse("create_dataset", kwargs={"version": API_VERSION}),
        jdata,
        content_type="application/json",
    )
    self.assertEqual(response.status_code, 201)

    cache.clear()

    # Bad, 456 does not have a quant.sf file
    post_data = {"email_address": "*****@*****.**", "data": {}}
    response = self.client.post(
        reverse("create_dataset", kwargs={"version": API_VERSION}),
        json.dumps(post_data),
        content_type="application/json",
    )
    self.assertEqual(response.status_code, 201)

    cache.clear()

    put_data = {**post_data, "data": {"GSE123": ["456"]}, "quant_sf_only": True}
    response = self.client.put(
        reverse("dataset", kwargs={"id": response.json()["id"], "version": API_VERSION}),
        json.dumps(put_data),
        content_type="application/json",
    )
    self.assertEqual(response.status_code, 400)
    self.assertIn(
        "Sample(s) in dataset are missing quant.sf files",
        response.json()["message"],
    )
    self.assertEqual(response.json()["details"], ["456"])

    cache.clear()

    # Bad, none of the samples in GSE123 have a quant.sf file
    response = self.client.post(
        reverse("create_dataset", kwargs={"version": API_VERSION}),
        json.dumps(post_data),
        content_type="application/json",
    )
    self.assertEqual(response.status_code, 201)

    cache.clear()

    response = self.client.put(
        reverse("dataset", kwargs={"id": response.json()["id"], "version": API_VERSION}),
        json.dumps({**put_data, "data": {"GSE123": ["ALL"]}}),
        content_type="application/json",
    )
    self.assertEqual(response.status_code, 400)
    self.assertIn(
        "Experiment(s) in dataset have zero downloadable samples",
        response.json()["message"],
    )
    self.assertEqual(response.json()["details"], ["GSE123"])

    cache.clear()

    # Make 456 have a quant.sf file
    result = ComputationalResult()
    result.save()

    sra = SampleResultAssociation()
    sra.sample = sample1
    sra.result = result
    sra.save()

    computed_file = ComputedFile()
    computed_file.s3_key = "smasher-test-quant.sf"
    computed_file.s3_bucket = "data-refinery-test-assets"
    computed_file.filename = "quant.sf"
    computed_file.result = result
    computed_file.size_in_bytes = 42
    computed_file.save()

    # Good, 456 does have a quant.sf file
    response = self.client.post(
        reverse("create_dataset", kwargs={"version": API_VERSION}),
        json.dumps(post_data),
        content_type="application/json",
    )
    self.assertEqual(response.status_code, 201)

    cache.clear()

    response = self.client.put(
        reverse("dataset", kwargs={"id": response.json()["id"], "version": API_VERSION}),
        json.dumps(put_data),
        content_type="application/json",
    )
    self.assertEqual(response.status_code, 200)

    cache.clear()

    # Good, a sample in GSE123 has a quant.sf file
    response = self.client.post(
        reverse("create_dataset", kwargs={"version": API_VERSION}),
        json.dumps(post_data),
        content_type="application/json",
    )
    cache.clear()
    self.assertEqual(response.status_code, 201)

    response = self.client.put(
        reverse("dataset", kwargs={"id": response.json()["id"], "version": API_VERSION}),
        json.dumps({**put_data, "data": {"GSE123": ["ALL"]}}),
        content_type="application/json",
    )
    self.assertEqual(response.status_code, 200)
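# Framework-free sketch of the quant_sf_only rule the PUTs above assert on;
# the function name and data shapes are illustrative assumptions, not the
# API's actual validation code.
def missing_quant_sf(requested_accessions, accession_to_filenames):
    """Return the accessions lacking a quant.sf file (the HTTP 400 details)."""
    return [
        acc for acc in requested_accessions
        if "quant.sf" not in accession_to_filenames.get(acc, ())
    ]

# e.g. missing_quant_sf(["456"], {"456": ["logquant.tsv"]}) == ["456"]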
def prepare_job():
    pj = ProcessorJob()
    pj.pipeline_applied = "SMASHER"
    pj.save()

    experiment = Experiment()
    experiment.accession_code = "GSE51081"
    experiment.save()

    result = ComputationalResult()
    result.save()

    homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

    sample = Sample()
    sample.accession_code = 'GSM1237810'
    sample.title = 'GSM1237810'
    sample.organism = homo_sapiens
    sample.save()

    sample_annotation = SampleAnnotation()
    sample_annotation.data = {'hi': 'friend'}
    sample_annotation.sample = sample
    sample_annotation.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1237810_T09-1084.PCL"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    sample = Sample()
    sample.accession_code = 'GSM1237812'
    sample.title = 'GSM1237812'
    sample.organism = homo_sapiens
    sample.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1237812_S97-PURE.PCL"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1237812_S97-PURE.DAT"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = False
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    ds = Dataset()
    ds.data = {'GSE51081': ['GSM1237810', 'GSM1237812']}
    ds.aggregate_by = 'EXPERIMENT'  # [ALL or SPECIES or EXPERIMENT]
    ds.scale_by = 'STANDARD'  # [NONE or MINMAX or STANDARD or ROBUST]
    ds.email_address = "*****@*****.**"
    ds.quantile_normalize = False
    ds.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = pj
    pjda.dataset = ds
    pjda.save()

    return pj
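# Example usage, mirroring the smash invocations used by the tests in this
# file. The test name is hypothetical, and the fixture PCL files referenced
# by prepare_job() must exist on disk for the smash to succeed.
def test_prepared_job(self):
    pj = prepare_job()
    final_context = smasher.smash(pj.pk, upload=False)
    self.assertTrue(final_context['success'])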
def test_make_experiment_result_associations(self):
    """Tests that the correct associations are made.

    The situation we're setting up is basically this:
      * tximport has been run for an experiment.
      * It made associations between the samples in the experiment and
        the ComputationalResult.
      * It didn't make associations between the experiment itself and
        the ComputationalResult.
      * There is a second experiment that hasn't had tximport run but
        shares a sample with the other experiment.
      * This second experiment has a sample which has not yet had
        tximport run on it.

    And what we're going to test for is:
      * An association is created between the tximport result and the
        first experiment.
      * An association is NOT created between the tximport result and
        the second experiment.
    """
    # Get an organism to set on samples:
    homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS", taxonomy_id=9606)

    # Create the tximport processor and result:
    processor = Processor()
    processor.name = "Tximport"
    processor.version = "v9.9.9"
    processor.docker_image = "dr_salmon"
    processor.environment = '{"some": "environment"}'
    processor.save()

    result = ComputationalResult()
    result.commands.append("tximport invocation")
    result.is_ccdl = True
    result.processor = processor
    result.save()

    # Create the first experiment and its samples:
    processed_experiment = Experiment()
    processed_experiment.accession_code = "SRP12345"
    processed_experiment.save()

    processed_sample_one = Sample()
    processed_sample_one.accession_code = "SRX12345"
    processed_sample_one.title = "SRX12345"
    processed_sample_one.organism = homo_sapiens
    processed_sample_one.save()

    sra = SampleResultAssociation()
    sra.sample = processed_sample_one
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = processed_experiment
    esa.sample = processed_sample_one
    esa.save()

    processed_sample_two = Sample()
    processed_sample_two.accession_code = "SRX12346"
    processed_sample_two.title = "SRX12346"
    processed_sample_two.organism = homo_sapiens
    processed_sample_two.save()

    sra = SampleResultAssociation()
    sra.sample = processed_sample_two
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = processed_experiment
    esa.sample = processed_sample_two
    esa.save()

    # Create the second experiment and its additional sample.
    unprocessed_experiment = Experiment()
    unprocessed_experiment.accession_code = "SRP6789"
    unprocessed_experiment.save()

    unprocessed_sample = Sample()
    unprocessed_sample.accession_code = "SRX6789"
    unprocessed_sample.title = "SRX6789"
    unprocessed_sample.organism = homo_sapiens
    unprocessed_sample.save()

    sra = SampleResultAssociation()
    sra.sample = unprocessed_sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = unprocessed_experiment
    esa.sample = unprocessed_sample
    esa.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = unprocessed_experiment
    esa.sample = processed_sample_two
    esa.save()

    # Run the function we're testing:
    make_experiment_result_associations()

    # Test that only one association was created and that it was
    # to the processed experiment:
    eras = ExperimentResultAssociation.objects.all()
    self.assertEqual(len(eras), 1)
    self.assertEqual(eras.first().experiment, processed_experiment)
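# For reference, constructing such an association directly, reusing the
# processed_experiment and result objects built in the test above. The
# `.result` field name is an assumption by symmetry with
# SampleResultAssociation; the test itself only reads `.experiment`.
era = ExperimentResultAssociation()
era.experiment = processed_experiment
era.result = result
era.save()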
def _run_multiqc(job_context: Dict) -> Dict:
    """Runs the `MultiQC` package to generate the QC report.

    TODO: These seem to consume a lot of RAM, even for small files.
    We should consider tuning these or breaking them out into their
    own processors. JVM settings may reduce RAM footprint.
    """
    command_str = "multiqc {input_directory} --outdir {qc_directory} --zip-data-dir"
    formatted_command = command_str.format(
        input_directory=job_context["qc_input_directory"],
        qc_directory=job_context["qc_directory"],
    )

    logger.debug(
        "Running MultiQC using the following shell command: %s",
        formatted_command,
        processor_job=job_context["job_id"],
    )

    qc_env = os.environ.copy()
    qc_env["LC_ALL"] = "C.UTF-8"
    qc_env["LANG"] = "C.UTF-8"

    time_start = timezone.now()
    completed_command = subprocess.run(
        formatted_command.split(),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        env=qc_env,
    )
    time_end = timezone.now()

    if completed_command.returncode != 0:
        stderr = completed_command.stderr.decode().strip()
        error_start = stderr.upper().find("ERROR:")
        error_start = error_start if error_start != -1 else 0

        logger.error(
            "Shell call to MultiQC failed with error message: %s",
            stderr[error_start:],
            processor_job=job_context["job_id"],
        )

        job_context["job"].failure_reason = (
            "Shell call to MultiQC failed because: " + stderr[error_start:]
        )
        job_context["success"] = False
        # Bail out before trying to register output files that were never written.
        return job_context

    result = ComputationalResult()
    result.commands.append(formatted_command)
    result.time_start = time_start
    result.time_end = time_end
    result.is_ccdl = True

    try:
        processor_key = "MULTIQC"
        result.processor = utils.find_processor(processor_key)
    except Exception as e:
        return utils.handle_processor_exception(job_context, processor_key, e)

    result.save()
    job_context['pipeline'].steps.append(result.id)

    assoc = SampleResultAssociation()
    assoc.sample = job_context["sample"]
    assoc.result = result
    assoc.save()

    job_context['qc_result'] = result

    data_file = ComputedFile()
    data_file.filename = "multiqc_data.zip"  # This is deterministic
    data_file.absolute_file_path = os.path.join(job_context["qc_directory"], data_file.filename)
    data_file.calculate_sha1()
    data_file.calculate_size()
    data_file.is_public = True
    data_file.result = job_context['qc_result']
    data_file.is_smashable = False
    data_file.is_qc = True
    data_file.save()
    job_context['computed_files'].append(data_file)

    SampleComputedFileAssociation.objects.get_or_create(
        sample=job_context["sample"], computed_file=data_file)

    report_file = ComputedFile()
    report_file.filename = "multiqc_report.html"  # This is deterministic
    report_file.absolute_file_path = os.path.join(job_context["qc_directory"], report_file.filename)
    report_file.calculate_sha1()
    report_file.calculate_size()
    report_file.is_public = True
    report_file.is_smashable = False
    report_file.is_qc = True
    report_file.result = job_context['qc_result']
    report_file.save()
    job_context['computed_files'].append(report_file)

    job_context['qc_files'] = [data_file, report_file]

    return job_context
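# Minimal illustrative job_context for _run_multiqc. The directory paths are
# placeholders, processor_job / sample / pipeline are assumed to be previously
# created model instances, and MultiQC's inputs are assumed to have already
# been written to qc_input_directory by the preceding QC steps.
job_context = {
    "job_id": processor_job.id,
    "job": processor_job,
    "sample": sample,
    "pipeline": pipeline,
    "computed_files": [],
    "qc_input_directory": "/home/user/data_store/QC/input/",
    "qc_directory": "/home/user/data_store/QC/output/",
}
job_context = _run_multiqc(job_context)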