def create_job_for_organism(organisms: List[Organism], svd_algorithm="ARPACK"):
    """Return a CREATE_COMPENDIA processor job covering the given organisms.

    Builds a species-aggregated, quantile-normalized Dataset from every
    experiment for the organisms, associates it with a new processor job,
    and sizes the job's RAM based on that dataset.
    """
    compendia_job = ProcessorJob()
    compendia_job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
    compendia_job.save()

    compendia_dataset = Dataset()
    compendia_dataset.data = get_dataset(organisms)
    compendia_dataset.scale_by = "NONE"
    compendia_dataset.aggregate_by = "SPECIES"
    compendia_dataset.quantile_normalize = True
    compendia_dataset.quant_sf_only = False
    compendia_dataset.svd_algorithm = svd_algorithm
    compendia_dataset.save()

    association = ProcessorJobDatasetAssociation()
    association.processor_job = compendia_job
    association.dataset = compendia_dataset
    association.save()

    # The RAM calculation reads the dataset through the association, so it
    # can only happen after the association has been saved.
    compendia_job.ram_amount = determine_ram_amount(compendia_job)
    compendia_job.save()

    return compendia_job
def create_job_for_organism(organism: Organism):
    """Returns a quantpendia job for the provided organism."""
    quantpendia_job = ProcessorJob()
    quantpendia_job.pipeline_applied = ProcessorPipeline.CREATE_QUANTPENDIA.value
    quantpendia_job.save()

    quantpendia_dataset = Dataset()
    quantpendia_dataset.data = build_dataset(organism)
    quantpendia_dataset.scale_by = "NONE"
    quantpendia_dataset.aggregate_by = "EXPERIMENT"
    quantpendia_dataset.quantile_normalize = False
    quantpendia_dataset.quant_sf_only = True
    quantpendia_dataset.svd_algorithm = "NONE"
    quantpendia_dataset.save()

    association = ProcessorJobDatasetAssociation()
    association.processor_job = quantpendia_job
    association.dataset = quantpendia_dataset
    association.save()

    # The RAM calculation reads the dataset through the association, so it
    # can only happen after the association has been saved.
    quantpendia_job.ram_amount = determine_ram_amount(quantpendia_job)
    quantpendia_job.save()

    return quantpendia_job
def create_job_for_organism(organism: Organism):
    """Returns a quantpendia job for the provided organism."""
    quantpendia_job = ProcessorJob()
    quantpendia_job.pipeline_applied = ProcessorPipeline.CREATE_QUANTPENDIA.value
    quantpendia_job.save()

    quantpendia_dataset = Dataset()
    quantpendia_dataset.data = build_dataset(organism)
    quantpendia_dataset.scale_by = "NONE"
    quantpendia_dataset.aggregate_by = "EXPERIMENT"
    quantpendia_dataset.quantile_normalize = False
    quantpendia_dataset.quant_sf_only = True
    quantpendia_dataset.svd_algorithm = "NONE"
    quantpendia_dataset.save()

    association = ProcessorJobDatasetAssociation()
    association.processor_job = quantpendia_job
    association.dataset = quantpendia_dataset
    association.save()

    return quantpendia_job
def test_create_compendia(self):
    """Run compendium creation over one microarray sample, one RNA-seq
    sample, and one sample whose computed file does not exist on disk.

    The job is expected to fail, but the missing-file sample must still
    be reported in ``filtered_samples``.
    """
    job = ProcessorJob()
    job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
    job.save()

    # MICROARRAY TECH
    experiment = Experiment()
    experiment.accession_code = "GSE1487313"
    experiment.save()

    result = ComputationalResult()
    result.save()

    gallus_gallus = Organism.get_object_for_name("GALLUS_GALLUS", taxonomy_id=1001)

    # Microarray sample backed by a real PCL file.
    sample = Sample()
    sample.accession_code = "GSM1487313"
    sample.title = "GSM1487313"
    sample.organism = gallus_gallus
    sample.technology = "MICROARRAY"
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1487313_liver.PCL"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    # Missing sample that will be filtered: its computed file points at a
    # path that does not exist on disk.
    sample = Sample()
    sample.accession_code = "GSM1487222"
    sample.title = "this sample will be filtered"
    sample.organism = gallus_gallus
    sample.technology = "MICROARRAY"
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1487222_empty.PCL"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/doesnt_exists.PCL"
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    # RNASEQ TECH
    experiment2 = Experiment()
    experiment2.accession_code = "SRS332914"
    experiment2.save()

    result2 = ComputationalResult()
    result2.save()

    sample2 = Sample()
    sample2.accession_code = "SRS332914"
    sample2.title = "SRS332914"
    sample2.organism = gallus_gallus
    sample2.technology = "RNA-SEQ"
    sample2.save()

    sra2 = SampleResultAssociation()
    sra2.sample = sample2
    sra2.result = result2
    sra2.save()

    esa2 = ExperimentSampleAssociation()
    esa2.experiment = experiment2
    esa2.sample = sample2
    esa2.save()

    computed_file2 = ComputedFile()
    computed_file2.filename = "SRP149598_gene_lengthScaledTPM.tsv"
    computed_file2.absolute_file_path = "/home/user/data_store/PCL/" + computed_file2.filename
    computed_file2.result = result2
    computed_file2.size_in_bytes = 234
    computed_file2.is_smashable = True
    computed_file2.save()

    assoc2 = SampleComputedFileAssociation()
    assoc2.sample = sample2
    assoc2.computed_file = computed_file2
    assoc2.save()

    dset = Dataset()
    # NOTE(review): the key "SRX332914" does not match the experiment
    # accession "SRS332914" created above — confirm whether this is
    # intentional (the job is expected to fail regardless).
    dset.data = {
        "GSE1487313": ["GSM1487313", "GSM1487222"],
        "SRX332914": ["SRS332914"]
    }
    dset.scale_by = "NONE"
    dset.aggregate_by = "SPECIES"
    dset.svd_algorithm = "ARPACK"
    dset.quantile_normalize = False
    dset.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dset
    pjda.save()

    final_context = create_compendia.create_compendia(job.id)

    self.assertFalse(job.success)

    # check that sample with no computed file was skipped
    self.assertTrue("GSM1487222" in final_context["filtered_samples"])
    self.assertEqual(
        final_context["filtered_samples"]["GSM1487222"]
        ["experiment_accession_code"],
        "GSE1487313",
    )
def test_create_compendia_danio(self):
    """Build a Danio rerio compendium from on-disk microarray and RNA-seq
    test fixtures, plus one sample with no computed file.

    Verifies the compendium result's computed file, SVD algorithm,
    primary organism, and that the file-less sample was filtered.
    """
    job = ProcessorJob()
    job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
    job.save()

    # MICROARRAY TECH
    experiment = Experiment()
    experiment.accession_code = "GSE1234"
    experiment.save()

    result = ComputationalResult()
    result.save()

    # QN target file for the organism; required for quantile normalization.
    qn_target = ComputedFile()
    qn_target.filename = "danio_target.tsv"
    qn_target.absolute_file_path = "/home/user/data_store/QN/danio_target.tsv"
    qn_target.is_qn_target = True
    qn_target.size_in_bytes = "12345"
    qn_target.sha1 = "aabbccddeeff"
    qn_target.result = result
    qn_target.save()

    danio_rerio = Organism(name="DANIO_RERIO", taxonomy_id=1, qn_target=result)
    danio_rerio.save()

    cra = ComputationalResultAnnotation()
    cra.data = {}
    cra.data["organism_id"] = danio_rerio.id
    cra.data["is_qn"] = True
    cra.result = result
    cra.save()

    result = ComputationalResult()
    result.save()

    # One microarray sample per fixture file in the test data directory.
    micros = []
    for file in os.listdir("/home/user/data_store/raw/TEST/MICROARRAY/"):
        if "microarray.txt" in file:
            continue

        sample = Sample()
        sample.accession_code = file
        sample.title = file
        sample.organism = danio_rerio
        sample.technology = "MICROARRAY"
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = file
        computed_file.absolute_file_path = "/home/user/data_store/raw/TEST/MICROARRAY/" + file
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        micros.append(file)

    experiment = Experiment()
    experiment.accession_code = "GSE5678"
    experiment.save()

    result = ComputationalResult()
    result.save()

    # One RNA-seq sample per fixture file in the test data directory.
    rnas = []
    for file in os.listdir("/home/user/data_store/raw/TEST/RNASEQ/"):
        if "rnaseq.txt" in file:
            continue

        sample = Sample()
        sample.accession_code = file
        sample.title = file
        sample.organism = danio_rerio
        sample.technology = "RNASEQ"
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = file
        computed_file.absolute_file_path = "/home/user/data_store/raw/TEST/RNASEQ/" + file
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        rnas.append(file)

    # Missing sample that will be filtered: it has no ComputedFile at all.
    sample = Sample()
    sample.accession_code = "GSM1487222"
    sample.title = "this sample will be filtered"
    sample.organism = danio_rerio
    sample.technology = "RNASEQ"
    sample.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    rnas.append(sample.accession_code)

    dset = Dataset()
    dset.data = {"GSE1234": micros, "GSE5678": rnas}
    dset.scale_by = "NONE"
    dset.aggregate_by = "SPECIES"
    dset.svd_algorithm = "ARPACK"
    dset.quantile_normalize = False
    dset.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dset
    pjda.save()

    final_context = create_compendia.create_compendia(job.id)

    # Verify result
    self.assertEqual(
        final_context["compendium_result"].result.computedfile_set.count(), 1)
    for file in final_context[
            "compendium_result"].result.computedfile_set.all():
        self.assertTrue(os.path.exists(file.absolute_file_path))

    # test compendium_result
    self.assertEqual(final_context["compendium_result"].svd_algorithm,
                     "ARPACK")
    self.assertEqual(
        final_context["compendium_result"].primary_organism.name,
        final_context["organism_name"])
    self.assertEqual(
        final_context["compendium_result"].primary_organism.name,
        "DANIO_RERIO")
    self.assertEqual(final_context["compendium_result"].organisms.count(), 1)

    # check that sample with no computed file was skipped
    self.assertTrue("GSM1487222" in final_context["filtered_samples"])
    self.assertEqual(
        final_context["filtered_samples"]["GSM1487222"]
        ["experiment_accession_code"], "GSE5678")
def test_create_compendia(self):
    """Run compendium creation with one usable microarray sample, one
    RNA-seq sample, and one microarray sample whose file is missing.

    With the missing sample filtered out there are too few samples left,
    so the job is expected to fail with an SVD shape error.
    """
    DATA_DIR = "/home/user/data_store/PCL/"
    job = ProcessorJob()
    job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
    job.save()

    gallus_gallus = Organism.get_object_for_name("GALLUS_GALLUS", taxonomy_id=1001)

    # MICROARRAY TECH
    (experiment, _) = Experiment.objects.get_or_create(accession_code="GSE1487313")
    experiment.accession_code = "GSE1487313"
    experiment.save()

    create_sample_for_experiment(
        {
            "organism": gallus_gallus,
            "accession_code": "GSM1487313",
            "technology": "MICROARRAY",
            "filename": "GSM1487313_liver.PCL",
            "data_dir": DATA_DIR,
        },
        experiment,
    )

    # Missing sample that will be filtered
    create_sample_for_experiment(
        {
            "organism": gallus_gallus,
            "accession_code": "GSM1487222",
            "title": "this sample will be filtered",
            "technology": "MICROARRAY",
            "filename": "GSM1487222_empty.PCL",
            "data_dir": DATA_DIR,
        },
        experiment,
    )

    # RNASEQ TECH
    experiment2 = Experiment()
    experiment2.accession_code = "SRP149598"
    experiment2.save()

    # NOTE(review): this RNA-seq sample is attached to `experiment`
    # (GSE1487313), not `experiment2` — confirm whether that is intentional.
    create_sample_for_experiment(
        {
            "organism": gallus_gallus,
            "accession_code": "SRR7250867",
            "technology": "RNA-SEQ",
            "filename": "SRP149598_gene_lengthScaledTPM.tsv",
            "data_dir": DATA_DIR,
        },
        experiment,
    )

    dset = Dataset()
    dset.data = {
        "GSE1487313": ["GSM1487313", "GSM1487222"],
        "SRP149598": ["SRR7250867"],
    }
    dset.scale_by = "NONE"
    dset.aggregate_by = "SPECIES"
    dset.svd_algorithm = "ARPACK"
    dset.quantile_normalize = True
    dset.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dset
    pjda.save()

    final_context = create_compendia.create_compendia(job.id)

    # Because one of the samples is filtered out, there will be too few
    # remaining samples to smash together, so we expect this job to fail.
    self.assertFailed(job, "k must be between 1 and min(A.shape)")

    # check that sample with no computed file was skipped
    self.assertTrue("GSM1487222" in final_context["filtered_samples"])
    self.assertEqual(
        final_context["filtered_samples"]["GSM1487222"]
        ["experiment_accession_code"],
        "GSE1487313",
    )
def test_imputation(self):
    """Check the accuracy of the imputation pipeline step.

    Runs the pipeline up to (but not including) imputation, records a
    baseline imputation result, then masks a deterministic set of values
    in the microarray matrix, re-runs imputation, and asserts the RMSE
    between imputed and baseline values stays within a known bound.
    """
    job = ProcessorJob()
    job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
    job.save()

    # MICROARRAY TECH
    experiment = Experiment()
    experiment.accession_code = "GSE1234"
    experiment.save()

    result = ComputationalResult()
    result.save()

    # QN target file for the organism; required for quantile normalization.
    qn_target = ComputedFile()
    qn_target.filename = "danio_target.tsv"
    qn_target.absolute_file_path = "/home/user/data_store/QN/danio_target.tsv"
    qn_target.is_qn_target = True
    qn_target.size_in_bytes = "12345"
    qn_target.sha1 = "aabbccddeeff"
    qn_target.result = result
    qn_target.save()

    danio_rerio = Organism(name="DANIO_RERIO", taxonomy_id=1, qn_target=result)
    danio_rerio.save()

    cra = ComputationalResultAnnotation()
    cra.data = {}
    cra.data["organism_id"] = danio_rerio.id
    cra.data["is_qn"] = True
    cra.result = result
    cra.save()

    result = ComputationalResult()
    result.save()

    # One microarray sample per fixture file in the test data directory.
    micros = []
    for file in os.listdir("/home/user/data_store/raw/TEST/MICROARRAY/"):
        if "microarray.txt" in file:
            continue

        create_sample_for_experiment(
            {
                "organism": danio_rerio,
                "accession_code": file,
                "technology": "MICROARRAY",
                "filename": file,
                "data_dir": "/home/user/data_store/raw/TEST/MICROARRAY/",
            },
            experiment,
        )

        micros.append(file)

    experiment = Experiment()
    experiment.accession_code = "GSE5678"
    experiment.save()

    result = ComputationalResult()
    result.save()

    # One RNA-seq sample per fixture file in the test data directory.
    rnas = []
    for file in os.listdir("/home/user/data_store/raw/TEST/RNASEQ/"):
        if "rnaseq.txt" in file:
            continue

        create_sample_for_experiment(
            {
                "organism": danio_rerio,
                "accession_code": file,
                "technology": "RNA-SEQ",
                "filename": file,
                "data_dir": "/home/user/data_store/raw/TEST/RNASEQ/",
            },
            experiment,
        )

        rnas.append(file)

    # Missing sample that will be filtered: filename=None means no file.
    sample = create_sample_for_experiment(
        {
            "organism": danio_rerio,
            "accession_code": "GSM1487222",
            "title": "this sample will be filtered",
            "technology": "RNA-SEQ",
            "filename": None,
        },
        experiment,
    )
    rnas.append(sample.accession_code)

    dset = Dataset()
    dset.data = {"GSE1234": micros, "GSE5678": rnas}
    dset.scale_by = "NONE"
    dset.aggregate_by = "SPECIES"
    dset.svd_algorithm = "ARPACK"
    dset.quantile_normalize = True
    dset.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dset
    pjda.save()

    # Run every pipeline step before imputation once, so the same context
    # can feed both the baseline and the masked imputation runs.
    imputation_index = create_compendia.COMPENDIA_PIPELINE.index(
        create_compendia._perform_imputation)

    pipeline = Pipeline(name=PipelineEnum.CREATE_COMPENDIA.value)
    job_context = utils.run_pipeline(
        {
            "job_id": job.id,
            "pipeline": pipeline
        },
        create_compendia.COMPENDIA_PIPELINE[:imputation_index],
    )

    # First, run the imputation step without removing anything to get a baseline
    expected_context = utils.run_pipeline(
        job_context.copy(), [create_compendia.COMPENDIA_PIPELINE[imputation_index]])

    # Now pick some rows to remove according to the instructions from
    # https://github.com/AlexsLemonade/refinebio/pull/2879#issuecomment-895143336
    random.seed(42)

    # Select some rows randomly and mask a little bit less than 30% of the values
    rare_rows = random.sample(list(job_context["microarray_matrix"].index), k=25)
    rare_genes = {}
    for row in rare_rows:
        cols = random.sample(
            list(job_context["microarray_matrix"].columns),
            # There are around 840 samples, and we want to pick a little bit
            # less than 30% of them
            k=int(0.28 * 840),
        )
        rare_genes[row] = cols
        for col in cols:
            job_context["microarray_matrix"].loc[row, col] = np.nan

    # Now randomly select some entries from the other rows to mask
    individual_indices = random.sample(
        list(
            itertools.product(
                set(job_context["microarray_matrix"].index) - set(rare_rows),
                job_context["microarray_matrix"].columns,
            )),
        k=1000,
    )
    for row, col in individual_indices:
        job_context["microarray_matrix"].loc[row, col] = np.nan

    final_context = utils.run_pipeline(
        job_context, [create_compendia.COMPENDIA_PIPELINE[imputation_index]])
    self.assertDidNotFail(job)

    # Only compare entries present in both the baseline and the masked run.
    index = set(final_context["merged_no_qn"].index) & set(
        expected_context["merged_no_qn"].index)
    columns = set(final_context["merged_no_qn"].columns) & set(
        expected_context["merged_no_qn"].columns)

    # Calculate the Root-Mean-Square Error (RMSE) of the imputed values.
    # See https://en.wikipedia.org/wiki/Root-mean-square_deviation
    # for a description of the formula.
    N = 0
    squared_error = 0
    affected_entries = {
        *individual_indices,
        *((row, col) for row, cols in rare_genes.items() for col in cols),
    }
    for row, col in affected_entries:
        if row in index and col in columns:
            actual = final_context["merged_no_qn"].loc[row, col]
            expected = expected_context["merged_no_qn"].loc[row, col]

            N += 1
            squared_error += (actual - expected)**2

    rmse = math.sqrt(squared_error / N)

    # The results of a previous run plus a little bit of leeway
    self.assertLess(abs(rmse - 0.2868600293662542), 0.05)
def test_drop_samples(self):
    """Make sure that we drop samples with >50% missing values"""
    job = ProcessorJob()
    job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
    job.save()

    danio_rerio = Organism(name="DANIO_RERIO", taxonomy_id=1)
    danio_rerio.save()

    experiment = Experiment()
    experiment.accession_code = "GSE1234"
    experiment.save()

    sample_ids = [str(n) for n in range(0, 10)]
    for accession in sample_ids:
        create_sample_for_experiment(
            {
                "organism": danio_rerio,
                "accession_code": accession,
                "technology": "MICROARRAY"
            },
            experiment,
        )

    dset = Dataset()
    dset.data = {"GSE1234": "ALL"}
    dset.scale_by = "NONE"
    dset.aggregate_by = "SPECIES"
    dset.svd_algorithm = "ARPACK"
    dset.quantile_normalize = True
    dset.save()

    # Build a 100-row matrix where the first four columns have holes of
    # decreasing density; all other columns are fully populated.
    matrix = pd.DataFrame(columns=sample_ids)
    for value in range(1, 101):
        row = {column: value for column in sample_ids}
        if value % 3 == 2:
            # "0" keeps values for i % 3 in {0, 1}: about 2/3 present.
            del row["0"]
        if value % 2 == 1:
            # "1" keeps only even rows: exactly 50% present.
            del row["1"]
        if value % 3 != 0:
            # "2" keeps only multiples of 3: about 1/3 present.
            del row["2"]
        if value % 4 != 0:
            # "3" keeps only multiples of 4: 25% present.
            del row["3"]
        matrix.loc[str(value)] = row

    job_context = {
        "microarray_matrix": matrix,
        "job": job,
        "dataset": dset,
        # This key is added in the setup code, so we need to add it ourselves here
        "filtered_samples": {},
    }

    job_context = create_compendia._full_outer_join_gene_matrices(job_context)
    final_job_context = create_compendia._filter_rows_and_columns(job_context)

    filtered_matrix = final_job_context["row_col_filtered_matrix"]

    # Columns 0 and 1 have missing data, but they should still have >= 50%.
    # Columns 2 and 3 are both missing >50% though, so they should be filtered.
    surviving_columns = {"0", "1"} | {str(n) for n in range(4, 10)}
    self.assertEqual(set(filtered_matrix.columns), surviving_columns)
    self.assertEqual(set(final_job_context["filtered_samples"].keys()), {"2", "3"})
    for filter_info in final_job_context["filtered_samples"].values():
        self.assertIn("less than 50% present", filter_info["reason"])
def test_create_compendia_microarray_only(self):
    """
    Make sure that we can actually create a compendium with just microarray samples.
    """
    job = ProcessorJob()
    job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
    job.save()

    # MICROARRAY TECH
    experiment = Experiment()
    experiment.accession_code = "GSE1234"
    experiment.save()

    result = ComputationalResult()
    result.save()

    # QN target file for the organism; required for quantile normalization.
    qn_target = ComputedFile()
    qn_target.filename = "danio_target.tsv"
    qn_target.absolute_file_path = "/home/user/data_store/QN/danio_target.tsv"
    qn_target.is_qn_target = True
    qn_target.size_in_bytes = "12345"
    qn_target.sha1 = "aabbccddeeff"
    qn_target.result = result
    qn_target.save()

    danio_rerio = Organism(name="DANIO_RERIO", taxonomy_id=1, qn_target=result)
    danio_rerio.save()

    cra = ComputationalResultAnnotation()
    cra.data = {}
    cra.data["organism_id"] = danio_rerio.id
    cra.data["is_qn"] = True
    cra.result = result
    cra.save()

    result = ComputationalResult()
    result.save()

    # One microarray sample per fixture file in the test data directory.
    micros = []
    for file in os.listdir("/home/user/data_store/raw/TEST/MICROARRAY/"):
        if "microarray.txt" in file:
            continue

        create_sample_for_experiment(
            {
                "organism": danio_rerio,
                "accession_code": file,
                "technology": "MICROARRAY",
                "filename": file,
                "data_dir": "/home/user/data_store/raw/TEST/MICROARRAY/",
            },
            experiment,
        )

        micros.append(file)

    dset = Dataset()
    dset.data = {"GSE1234": micros}
    dset.scale_by = "NONE"
    dset.aggregate_by = "SPECIES"
    dset.svd_algorithm = "ARPACK"
    dset.quantile_normalize = True
    dset.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dset
    pjda.save()

    final_context = create_compendia.create_compendia(job.id)
    self.assertSucceeded(job)

    # Verify result
    self.assertEqual(
        final_context["compendium_result"].result.computedfile_set.count(), 1)
    for file in final_context[
            "compendium_result"].result.computedfile_set.all():
        self.assertTrue(os.path.exists(file.absolute_file_path))

    # test compendium_result
    self.assertEqual(final_context["compendium_result"].svd_algorithm,
                     "ARPACK")
    self.assertEqual(
        final_context["compendium_result"].primary_organism.name,
        final_context["organism_name"],
    )
    self.assertEqual(
        final_context["compendium_result"].primary_organism.name,
        "DANIO_RERIO")
    self.assertEqual(final_context["compendium_result"].organisms.count(), 1)

    # Inspect the archive's metadata to confirm the compendium's shape.
    zf = zipfile.ZipFile(final_context["compendium_result"].result.
                         computedfile_set.first().absolute_file_path)
    with zf.open("aggregated_metadata.json") as f:
        metadata = json.load(f)

        self.assertFalse(metadata.get("quant_sf_only"))
        # 420 microarray
        self.assertEqual(metadata.get("num_samples"), 420)
        self.assertEqual(metadata.get("num_experiments"), 1)

        # Make sure the data were quantile normalized
        self.assertTrue(metadata.get("quantile_normalized"))

    self.assertIn("ks_statistic", final_context)
    self.assertIn("ks_pvalue", final_context)
    self.assertEqual(final_context["ks_pvalue"], 1.0)
def test_create_compendia_danio(self):
    """Build a Danio rerio compendium from microarray and RNA-seq fixture
    files plus one file-less sample, then verify the compendium result,
    the filtered-sample bookkeeping, and the archive's metadata.
    """
    job = ProcessorJob()
    job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
    job.save()

    # MICROARRAY TECH
    experiment = Experiment()
    experiment.accession_code = "GSE1234"
    experiment.save()

    result = ComputationalResult()
    result.save()

    # QN target file for the organism; required for quantile normalization.
    qn_target = ComputedFile()
    qn_target.filename = "danio_target.tsv"
    qn_target.absolute_file_path = "/home/user/data_store/QN/danio_target.tsv"
    qn_target.is_qn_target = True
    qn_target.size_in_bytes = "12345"
    qn_target.sha1 = "aabbccddeeff"
    qn_target.result = result
    qn_target.save()

    danio_rerio = Organism(name="DANIO_RERIO", taxonomy_id=1, qn_target=result)
    danio_rerio.save()

    cra = ComputationalResultAnnotation()
    cra.data = {}
    cra.data["organism_id"] = danio_rerio.id
    cra.data["is_qn"] = True
    cra.result = result
    cra.save()

    result = ComputationalResult()
    result.save()

    # One microarray sample per fixture file in the test data directory.
    micros = []
    for file in os.listdir("/home/user/data_store/raw/TEST/MICROARRAY/"):
        if "microarray.txt" in file:
            continue

        create_sample_for_experiment(
            {
                "organism": danio_rerio,
                "accession_code": file,
                "technology": "MICROARRAY",
                "filename": file,
                "data_dir": "/home/user/data_store/raw/TEST/MICROARRAY/",
            },
            experiment,
        )

        micros.append(file)

    experiment = Experiment()
    experiment.accession_code = "GSE5678"
    experiment.save()

    result = ComputationalResult()
    result.save()

    # One RNA-seq sample per fixture file in the test data directory.
    rnas = []
    for file in os.listdir("/home/user/data_store/raw/TEST/RNASEQ/"):
        if "rnaseq.txt" in file:
            continue

        create_sample_for_experiment(
            {
                "organism": danio_rerio,
                "accession_code": file,
                "technology": "RNA-SEQ",
                "filename": file,
                "data_dir": "/home/user/data_store/raw/TEST/RNASEQ/",
            },
            experiment,
        )

        rnas.append(file)

    # Missing sample that will be filtered: filename=None means no file.
    sample = create_sample_for_experiment(
        {
            "organism": danio_rerio,
            "accession_code": "GSM1487222",
            "title": "this sample will be filtered",
            "technology": "RNA-SEQ",
            "filename": None,
        },
        experiment,
    )
    rnas.append(sample.accession_code)

    dset = Dataset()
    dset.data = {"GSE1234": micros, "GSE5678": rnas}
    dset.scale_by = "NONE"
    dset.aggregate_by = "SPECIES"
    dset.svd_algorithm = "ARPACK"
    dset.quantile_normalize = True
    dset.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dset
    pjda.save()

    final_context = create_compendia.create_compendia(job.id)
    self.assertSucceeded(job)

    # Verify result
    self.assertEqual(
        final_context["compendium_result"].result.computedfile_set.count(), 1)
    for file in final_context[
            "compendium_result"].result.computedfile_set.all():
        self.assertTrue(os.path.exists(file.absolute_file_path))

    # test compendium_result
    self.assertEqual(final_context["compendium_result"].svd_algorithm,
                     "ARPACK")
    self.assertEqual(
        final_context["compendium_result"].primary_organism.name,
        final_context["organism_name"],
    )
    self.assertEqual(
        final_context["compendium_result"].primary_organism.name,
        "DANIO_RERIO")
    self.assertEqual(final_context["compendium_result"].organisms.count(), 1)

    self.assertEqual(len(final_context["filtered_samples"]), 10)

    # check that sample with no computed file was skipped
    self.assertTrue("GSM1487222" in final_context["filtered_samples"])
    self.assertEqual(
        final_context["filtered_samples"]["GSM1487222"]
        ["experiment_accession_code"], "GSE5678")
    self.assertIn(
        "This sample did not have a processed file",
        final_context["filtered_samples"]["GSM1487222"]["reason"],
    )

    # check that the 9 files with lots of missing measurements were filtered
    self.assertEqual(
        len(
            list(
                filter(
                    lambda x: "less than 50% present values" in x["reason"
                                                                  ],
                    final_context["filtered_samples"].values(),
                ))),
        9,
    )

    # Inspect the archive's metadata to confirm the compendium's shape.
    zf = zipfile.ZipFile(final_context["compendium_result"].result.
                         computedfile_set.first().absolute_file_path)
    with zf.open("aggregated_metadata.json") as f:
        metadata = json.load(f)

        self.assertFalse(metadata.get("quant_sf_only"))
        self.assertEqual(metadata.get("compendium_version"), 1)

        # 420 microarray + 420 RNA seq
        # -1 that is filtered for a missing file
        # -9 that are filtered for having less than 50% present values
        self.assertEqual(metadata.get("num_samples"), 830)
        self.assertEqual(metadata.get("num_experiments"), 2)

        # Make sure the data were quantile normalized
        self.assertTrue(metadata.get("quantile_normalized"))

    self.assertIn("ks_statistic", final_context)
    self.assertIn("ks_pvalue", final_context)
    self.assertEqual(final_context["ks_pvalue"], 1.0)