Example #1
def create_job_for_organism(organisms: List[Organism], svd_algorithm="ARPACK"):
    """Returns a compendia job for the provided organism.

    Fetch all of the experiments and compile large but normally formated Dataset.
    """
    job = ProcessorJob()
    job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
    job.save()

    dataset = Dataset()
    dataset.data = get_dataset(organisms)
    dataset.scale_by = "NONE"
    dataset.aggregate_by = "SPECIES"
    dataset.quantile_normalize = True
    dataset.quant_sf_only = False
    dataset.svd_algorithm = svd_algorithm
    dataset.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dataset
    pjda.save()

    # Have to call this after setting the dataset since it's used in
    # the calculation.
    job.ram_amount = determine_ram_amount(job)
    job.save()

    return job
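
A minimal usage sketch (not from the source module; assumes a configured Django environment with the refine.bio models importable). The organism lookup mirrors Organism.get_object_for_name as used in the test examples below; everything else just calls the helper defined above:

# Hedged usage sketch, not part of the original module.
gallus_gallus = Organism.get_object_for_name("GALLUS_GALLUS", taxonomy_id=1001)
job = create_job_for_organism([gallus_gallus], svd_algorithm="ARPACK")
print(job.id, job.ram_amount)  # saved with its Dataset association and RAM amount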
Example #2
def create_job_for_organism(organism: Organism):
    """Returns a quantpendia job for the provided organism."""
    job = ProcessorJob()
    job.pipeline_applied = ProcessorPipeline.CREATE_QUANTPENDIA.value
    job.save()

    dset = Dataset()
    dset.data = build_dataset(organism)
    dset.scale_by = "NONE"
    dset.aggregate_by = "EXPERIMENT"
    dset.quantile_normalize = False
    dset.quant_sf_only = True
    dset.svd_algorithm = "NONE"
    dset.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dset
    pjda.save()

    # Have to call this after setting the dataset since it's used in
    # the calculation.
    job.ram_amount = determine_ram_amount(job)
    job.save()

    return job
Example #3
def create_job_for_organism(organism: Organism):
    """Returns a quantpendia job for the provided organism."""
    job = ProcessorJob()
    job.pipeline_applied = ProcessorPipeline.CREATE_QUANTPENDIA.value
    job.save()

    dset = Dataset()
    dset.data = build_dataset(organism)
    dset.scale_by = "NONE"
    dset.aggregate_by = "EXPERIMENT"
    dset.quantile_normalize = False
    dset.quant_sf_only = True
    dset.svd_algorithm = "NONE"
    dset.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dset
    pjda.save()

    return job
Example #4
    def test_create_compendia(self):
        job = ProcessorJob()
        job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
        job.save()

        # MICROARRAY TECH
        experiment = Experiment()
        experiment.accession_code = "GSE1487313"
        experiment.save()

        result = ComputationalResult()
        result.save()

        gallus_gallus = Organism.get_object_for_name("GALLUS_GALLUS",
                                                     taxonomy_id=1001)

        sample = Sample()
        sample.accession_code = "GSM1487313"
        sample.title = "GSM1487313"
        sample.organism = gallus_gallus
        sample.technology = "MICROARRAY"
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "GSM1487313_liver.PCL"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        # Missing sample that will be filtered
        sample = Sample()
        sample.accession_code = "GSM1487222"
        sample.title = "this sample will be filtered"
        sample.organism = gallus_gallus
        sample.technology = "MICROARRAY"
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "GSM1487222_empty.PCL"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/doesnt_exists.PCL"
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        # RNASEQ TECH
        experiment2 = Experiment()
        experiment2.accession_code = "SRS332914"
        experiment2.save()

        result2 = ComputationalResult()
        result2.save()

        sample2 = Sample()
        sample2.accession_code = "SRS332914"
        sample2.title = "SRS332914"
        sample2.organism = gallus_gallus
        sample2.technology = "RNA-SEQ"
        sample2.save()

        sra2 = SampleResultAssociation()
        sra2.sample = sample2
        sra2.result = result2
        sra2.save()

        esa2 = ExperimentSampleAssociation()
        esa2.experiment = experiment2
        esa2.sample = sample2
        esa2.save()

        computed_file2 = ComputedFile()
        computed_file2.filename = "SRP149598_gene_lengthScaledTPM.tsv"
        computed_file2.absolute_file_path = "/home/user/data_store/PCL/" + computed_file2.filename
        computed_file2.result = result2
        computed_file2.size_in_bytes = 234
        computed_file2.is_smashable = True
        computed_file2.save()

        assoc2 = SampleComputedFileAssociation()
        assoc2.sample = sample2
        assoc2.computed_file = computed_file2
        assoc2.save()

        dset = Dataset()
        dset.data = {
            "GSE1487313": ["GSM1487313", "GSM1487222"],
            "SRX332914": ["SRS332914"]
        }
        dset.scale_by = "NONE"
        dset.aggregate_by = "SPECIES"
        dset.svd_algorithm = "ARPACK"
        dset.quantile_normalize = False
        dset.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dset
        pjda.save()

        final_context = create_compendia.create_compendia(job.id)

        self.assertFalse(job.success)

        # check that sample with no computed file was skipped
        self.assertTrue("GSM1487222" in final_context["filtered_samples"])
        self.assertEqual(
            final_context["filtered_samples"]["GSM1487222"]
            ["experiment_accession_code"],
            "GSE1487313",
        )
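
For reference, the shape of final_context["filtered_samples"] as reconstructed from the assertions in these tests (values illustrative; the "reason" text is only the substring asserted in Example #10, not necessarily the full message):

# Hypothetical illustration, not produced by running the pipeline.
filtered_samples = {
    "GSM1487222": {
        "experiment_accession_code": "GSE1487313",
        "reason": "This sample did not have a processed file",
    },
}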
Example #5
    def test_create_compendia_danio(self):
        job = ProcessorJob()
        job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
        job.save()

        # MICROARRAY TECH
        experiment = Experiment()
        experiment.accession_code = "GSE1234"
        experiment.save()

        result = ComputationalResult()
        result.save()

        qn_target = ComputedFile()
        qn_target.filename = "danio_target.tsv"
        qn_target.absolute_file_path = "/home/user/data_store/QN/danio_target.tsv"
        qn_target.is_qn_target = True
        qn_target.size_in_bytes = "12345"
        qn_target.sha1 = "aabbccddeeff"
        qn_target.result = result
        qn_target.save()

        danio_rerio = Organism(name="DANIO_RERIO",
                               taxonomy_id=1,
                               qn_target=result)
        danio_rerio.save()

        cra = ComputationalResultAnnotation()
        cra.data = {}
        cra.data["organism_id"] = danio_rerio.id
        cra.data["is_qn"] = True
        cra.result = result
        cra.save()

        result = ComputationalResult()
        result.save()

        micros = []
        for file in os.listdir("/home/user/data_store/raw/TEST/MICROARRAY/"):

            if "microarray.txt" in file:
                continue

            sample = Sample()
            sample.accession_code = file
            sample.title = file
            sample.organism = danio_rerio
            sample.technology = "MICROARRAY"
            sample.save()

            sra = SampleResultAssociation()
            sra.sample = sample
            sra.result = result
            sra.save()

            esa = ExperimentSampleAssociation()
            esa.experiment = experiment
            esa.sample = sample
            esa.save()

            computed_file = ComputedFile()
            computed_file.filename = file
            computed_file.absolute_file_path = "/home/user/data_store/raw/TEST/MICROARRAY/" + file
            computed_file.result = result
            computed_file.size_in_bytes = 123
            computed_file.is_smashable = True
            computed_file.save()

            assoc = SampleComputedFileAssociation()
            assoc.sample = sample
            assoc.computed_file = computed_file
            assoc.save()

            micros.append(file)

        experiment = Experiment()
        experiment.accession_code = "GSE5678"
        experiment.save()

        result = ComputationalResult()
        result.save()
        rnas = []
        for file in os.listdir("/home/user/data_store/raw/TEST/RNASEQ/"):

            if "rnaseq.txt" in file:
                continue

            sample = Sample()
            sample.accession_code = file
            sample.title = file
            sample.organism = danio_rerio
            sample.technology = "RNASEQ"
            sample.save()

            sra = SampleResultAssociation()
            sra.sample = sample
            sra.result = result
            sra.save()

            esa = ExperimentSampleAssociation()
            esa.experiment = experiment
            esa.sample = sample
            esa.save()

            computed_file = ComputedFile()
            computed_file.filename = file
            computed_file.absolute_file_path = "/home/user/data_store/raw/TEST/RNASEQ/" + file
            computed_file.result = result
            computed_file.size_in_bytes = 123
            computed_file.is_smashable = True
            computed_file.save()

            assoc = SampleComputedFileAssociation()
            assoc.sample = sample
            assoc.computed_file = computed_file
            assoc.save()

            rnas.append(file)

        # Missing sample that will be filtered
        sample = Sample()
        sample.accession_code = "GSM1487222"
        sample.title = "this sample will be filtered"
        sample.organism = danio_rerio
        sample.technology = "RNASEQ"
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        rnas.append(sample.accession_code)

        dset = Dataset()
        dset.data = {"GSE1234": micros, "GSE5678": rnas}
        dset.scale_by = "NONE"
        dset.aggregate_by = "SPECIES"
        dset.svd_algorithm = "ARPACK"
        dset.quantile_normalize = False
        dset.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dset
        pjda.save()

        final_context = create_compendia.create_compendia(job.id)

        # Verify result
        self.assertEqual(
            final_context["compendium_result"].result.computedfile_set.count(),
            1)
        for file in final_context[
                "compendium_result"].result.computedfile_set.all():
            self.assertTrue(os.path.exists(file.absolute_file_path))

        # test compendium_result
        self.assertEqual(final_context["compendium_result"].svd_algorithm,
                         "ARPACK")
        self.assertEqual(
            final_context["compendium_result"].primary_organism.name,
            final_context["organism_name"])
        self.assertEqual(
            final_context["compendium_result"].primary_organism.name,
            "DANIO_RERIO")
        self.assertEqual(final_context["compendium_result"].organisms.count(),
                         1)

        # check that sample with no computed file was skipped
        self.assertTrue("GSM1487222" in final_context["filtered_samples"])
        self.assertEqual(
            final_context["filtered_samples"]["GSM1487222"]
            ["experiment_accession_code"], "GSE5678")
Example #6
    def test_create_compendia(self):
        DATA_DIR = "/home/user/data_store/PCL/"

        job = ProcessorJob()
        job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
        job.save()

        gallus_gallus = Organism.get_object_for_name("GALLUS_GALLUS",
                                                     taxonomy_id=1001)

        # MICROARRAY TECH
        (experiment,
         _) = Experiment.objects.get_or_create(accession_code="GSE1487313")
        experiment.accession_code = "GSE1487313"
        experiment.save()

        create_sample_for_experiment(
            {
                "organism": gallus_gallus,
                "accession_code": "GSM1487313",
                "technology": "MICROARRAY",
                "filename": "GSM1487313_liver.PCL",
                "data_dir": DATA_DIR,
            },
            experiment,
        )

        # Missing sample that will be filtered
        create_sample_for_experiment(
            {
                "organism": gallus_gallus,
                "accession_code": "GSM1487222",
                "title": "this sample will be filtered",
                "technology": "MICROARRAY",
                "filename": "GSM1487222_empty.PCL",
                "data_dir": DATA_DIR,
            },
            experiment,
        )

        # RNASEQ TECH
        experiment2 = Experiment()
        experiment2.accession_code = "SRP149598"
        experiment2.save()

        create_sample_for_experiment(
            {
                "organism": gallus_gallus,
                "accession_code": "SRR7250867",
                "technology": "RNA-SEQ",
                "filename": "SRP149598_gene_lengthScaledTPM.tsv",
                "data_dir": DATA_DIR,
            },
            experiment2,
        )

        dset = Dataset()
        dset.data = {
            "GSE1487313": ["GSM1487313", "GSM1487222"],
            "SRP149598": ["SRR7250867"],
        }
        dset.scale_by = "NONE"
        dset.aggregate_by = "SPECIES"
        dset.svd_algorithm = "ARPACK"
        dset.quantile_normalize = True
        dset.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dset
        pjda.save()

        final_context = create_compendia.create_compendia(job.id)

        # Because one of the samples is filtered out, there will be too few
        # remaining samples to smash together, so we expect this job to fail
        # (see the sketch after this example).
        self.assertFailed(job, "k must be between 1 and min(A.shape)")

        # check that sample with no computed file was skipped
        self.assertTrue("GSM1487222" in final_context["filtered_samples"])
        self.assertEqual(
            final_context["filtered_samples"]["GSM1487222"]
            ["experiment_accession_code"],
            "GSE1487313",
        )
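
For context, a minimal sketch (not part of the test suite) of where the asserted error text comes from, assuming the pipeline's SVD step is backed by scipy's ARPACK wrapper, whose message matches the one asserted above:

import numpy as np
from scipy.sparse.linalg import svds

# Suppose that after filtering only one sample (column) survives.
matrix = np.random.rand(200, 1)
try:
    svds(matrix, k=1)  # ARPACK requires 0 < k < min(A.shape), and min(A.shape) == 1
except ValueError as e:
    print(e)  # in older scipy releases: "k must be between 1 and min(A.shape), k=1"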
Example #7
    def test_imputation(self):
        job = ProcessorJob()
        job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
        job.save()

        # MICROARRAY TECH
        experiment = Experiment()
        experiment.accession_code = "GSE1234"
        experiment.save()

        result = ComputationalResult()
        result.save()

        qn_target = ComputedFile()
        qn_target.filename = "danio_target.tsv"
        qn_target.absolute_file_path = "/home/user/data_store/QN/danio_target.tsv"
        qn_target.is_qn_target = True
        qn_target.size_in_bytes = "12345"
        qn_target.sha1 = "aabbccddeeff"
        qn_target.result = result
        qn_target.save()

        danio_rerio = Organism(name="DANIO_RERIO",
                               taxonomy_id=1,
                               qn_target=result)
        danio_rerio.save()

        cra = ComputationalResultAnnotation()
        cra.data = {}
        cra.data["organism_id"] = danio_rerio.id
        cra.data["is_qn"] = True
        cra.result = result
        cra.save()

        result = ComputationalResult()
        result.save()

        micros = []
        for file in os.listdir("/home/user/data_store/raw/TEST/MICROARRAY/"):

            if "microarray.txt" in file:
                continue

            create_sample_for_experiment(
                {
                    "organism": danio_rerio,
                    "accession_code": file,
                    "technology": "MICROARRAY",
                    "filename": file,
                    "data_dir": "/home/user/data_store/raw/TEST/MICROARRAY/",
                },
                experiment,
            )

            micros.append(file)

        experiment = Experiment()
        experiment.accession_code = "GSE5678"
        experiment.save()

        result = ComputationalResult()
        result.save()
        rnas = []
        for file in os.listdir("/home/user/data_store/raw/TEST/RNASEQ/"):

            if "rnaseq.txt" in file:
                continue

            create_sample_for_experiment(
                {
                    "organism": danio_rerio,
                    "accession_code": file,
                    "technology": "RNA-SEQ",
                    "filename": file,
                    "data_dir": "/home/user/data_store/raw/TEST/RNASEQ/",
                },
                experiment,
            )

            rnas.append(file)

        # Missing sample that will be filtered
        sample = create_sample_for_experiment(
            {
                "organism": danio_rerio,
                "accession_code": "GSM1487222",
                "title": "this sample will be filtered",
                "technology": "RNA-SEQ",
                "filename": None,
            },
            experiment,
        )
        rnas.append(sample.accession_code)

        dset = Dataset()
        dset.data = {"GSE1234": micros, "GSE5678": rnas}
        dset.scale_by = "NONE"
        dset.aggregate_by = "SPECIES"
        dset.svd_algorithm = "ARPACK"
        dset.quantile_normalize = True
        dset.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dset
        pjda.save()

        imputation_index = create_compendia.COMPENDIA_PIPELINE.index(
            create_compendia._perform_imputation)

        pipeline = Pipeline(name=PipelineEnum.CREATE_COMPENDIA.value)
        job_context = utils.run_pipeline(
            {
                "job_id": job.id,
                "pipeline": pipeline
            },
            create_compendia.COMPENDIA_PIPELINE[:imputation_index],
        )

        # First, run the imputation step without removing anything to get a baseline
        expected_context = utils.run_pipeline(
            job_context.copy(),
            [create_compendia.COMPENDIA_PIPELINE[imputation_index]])

        # Now pick some rows to remove according to the instructions from
        # https://github.com/AlexsLemonade/refinebio/pull/2879#issuecomment-895143336

        random.seed(42)

        # Select some rows randomly and mask a little bit less than 30% of the values
        rare_rows = random.sample(list(job_context["microarray_matrix"].index),
                                  k=25)
        rare_genes = {}
        for row in rare_rows:
            cols = random.sample(
                list(job_context["microarray_matrix"].columns),
                # There are around 840 samples, and we want to pick a little bit
                # less than 30% of them
                k=int(0.28 * 840),
            )
            rare_genes[row] = cols
            for col in cols:
                job_context["microarray_matrix"].loc[row, col] = np.nan

        # Now randomly select some entries from the other rows to mask
        individual_indices = random.sample(
            list(
                itertools.product(
                    set(job_context["microarray_matrix"].index) -
                    set(rare_rows),
                    job_context["microarray_matrix"].columns,
                )),
            k=1000,
        )
        for row, col in individual_indices:
            job_context["microarray_matrix"].loc[row, col] = np.nan

        final_context = utils.run_pipeline(
            job_context,
            [create_compendia.COMPENDIA_PIPELINE[imputation_index]])
        self.assertDidNotFail(job)

        index = set(final_context["merged_no_qn"].index) & set(
            expected_context["merged_no_qn"].index)
        columns = set(final_context["merged_no_qn"].columns) & set(
            expected_context["merged_no_qn"].columns)

        # Calculate the Root-Mean-Square Error (RMSE) of the imputed values.
        # See https://en.wikipedia.org/wiki/Root-mean-square_deviation
        # for a description of the formula.
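        #
        #     RMSE = sqrt((1 / N) * sum_i (actual_i - expected_i)^2)
        #
        # where N counts only the masked entries present in both the
        # baseline and the re-imputed matrices.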

        N = 0
        squared_error = 0
        affected_entries = {
            *individual_indices,
            *((row, col) for row, cols in rare_genes.items() for col in cols),
        }
        for row, col in affected_entries:
            if row in index and col in columns:
                actual = final_context["merged_no_qn"].loc[row, col]
                expected = expected_context["merged_no_qn"].loc[row, col]

                N += 1
                squared_error += (actual - expected)**2

        rmse = math.sqrt(squared_error / N)

        # The results of a previous run plus a little bit of leeway
        self.assertLess(abs(rmse - 0.2868600293662542), 0.05)
Example #8
    def test_drop_samples(self):
        """Make sure that we drop samples with >50% missing values"""
        job = ProcessorJob()
        job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
        job.save()

        danio_rerio = Organism(name="DANIO_RERIO", taxonomy_id=1)
        danio_rerio.save()

        experiment = Experiment()
        experiment.accession_code = "GSE1234"
        experiment.save()

        samples = list(str(i) for i in range(0, 10))
        for i in samples:
            create_sample_for_experiment(
                {
                    "organism": danio_rerio,
                    "accession_code": i,
                    "technology": "MICROARRAY"
                },
                experiment,
            )

        dset = Dataset()
        dset.data = {"GSE1234": "ALL"}
        dset.scale_by = "NONE"
        dset.aggregate_by = "SPECIES"
        dset.svd_algorithm = "ARPACK"
        dset.quantile_normalize = True
        dset.save()

        df = pd.DataFrame(columns=samples)
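        # By construction over i = 1..100:
        #   column "0" is present when i % 3 in {0, 1}  -> ~67% present (kept)
        #   column "1" is present when i is even        ->  50% present (kept)
        #   column "2" is present when i % 3 == 0       -> ~33% present (dropped)
        #   column "3" is present when i % 4 == 0       ->  25% present (dropped)
        # Columns "4" through "9" are always fully present.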
        for i in range(1, 101):
            row_i = {idx: i for idx in samples}

            if i % 3 != 0 and i % 3 != 1:
                del row_i["0"]

            if i % 2 != 0:
                del row_i["1"]

            if i % 3 != 0:
                del row_i["2"]

            if i % 4 != 0:
                del row_i["3"]

            df.loc[str(i)] = row_i

        job_context = {
            "microarray_matrix": df,
            "job": job,
            "dataset": dset,
            # This key is added in the setup code, so we need to add it ourselves here
            "filtered_samples": {},
        }

        job_context = create_compendia._full_outer_join_gene_matrices(
            job_context)
        final_job_context = create_compendia._filter_rows_and_columns(
            job_context)

        filtered_matrix = final_job_context["row_col_filtered_matrix"]

        # Columns 0 and 1 have missing data, but they still have >= 50% present
        # values. Columns 2 and 3 are both missing >50%, so they should be filtered.
        self.assertEqual(set(filtered_matrix.columns),
                         {"0", "1"} | {str(i)
                                       for i in range(4, 10)})
        self.assertEqual(set(final_job_context["filtered_samples"].keys()),
                         {"2", "3"})
        for v in final_job_context["filtered_samples"].values():
            self.assertIn("less than 50% present", v["reason"])
Example #9
    def test_create_compendia_microarray_only(self):
        """
        Make sure that we can actually create a compendium with just microarray samples.
        """
        job = ProcessorJob()
        job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
        job.save()

        # MICROARRAY TECH
        experiment = Experiment()
        experiment.accession_code = "GSE1234"
        experiment.save()

        result = ComputationalResult()
        result.save()

        qn_target = ComputedFile()
        qn_target.filename = "danio_target.tsv"
        qn_target.absolute_file_path = "/home/user/data_store/QN/danio_target.tsv"
        qn_target.is_qn_target = True
        qn_target.size_in_bytes = "12345"
        qn_target.sha1 = "aabbccddeeff"
        qn_target.result = result
        qn_target.save()

        danio_rerio = Organism(name="DANIO_RERIO",
                               taxonomy_id=1,
                               qn_target=result)
        danio_rerio.save()

        cra = ComputationalResultAnnotation()
        cra.data = {}
        cra.data["organism_id"] = danio_rerio.id
        cra.data["is_qn"] = True
        cra.result = result
        cra.save()

        result = ComputationalResult()
        result.save()

        micros = []
        for file in os.listdir("/home/user/data_store/raw/TEST/MICROARRAY/"):

            if "microarray.txt" in file:
                continue

            create_sample_for_experiment(
                {
                    "organism": danio_rerio,
                    "accession_code": file,
                    "technology": "MICROARRAY",
                    "filename": file,
                    "data_dir": "/home/user/data_store/raw/TEST/MICROARRAY/",
                },
                experiment,
            )

            micros.append(file)

        dset = Dataset()
        dset.data = {"GSE1234": micros}
        dset.scale_by = "NONE"
        dset.aggregate_by = "SPECIES"
        dset.svd_algorithm = "ARPACK"
        dset.quantile_normalize = True
        dset.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dset
        pjda.save()

        final_context = create_compendia.create_compendia(job.id)

        self.assertSucceeded(job)

        # Verify result
        self.assertEqual(
            final_context["compendium_result"].result.computedfile_set.count(),
            1)
        for file in final_context[
                "compendium_result"].result.computedfile_set.all():
            self.assertTrue(os.path.exists(file.absolute_file_path))

        # test compendium_result
        self.assertEqual(final_context["compendium_result"].svd_algorithm,
                         "ARPACK")
        self.assertEqual(
            final_context["compendium_result"].primary_organism.name,
            final_context["organism_name"],
        )
        self.assertEqual(
            final_context["compendium_result"].primary_organism.name,
            "DANIO_RERIO")
        self.assertEqual(final_context["compendium_result"].organisms.count(),
                         1)

        zf = zipfile.ZipFile(final_context["compendium_result"].result.
                             computedfile_set.first().absolute_file_path)
        with zf.open("aggregated_metadata.json") as f:
            metadata = json.load(f)

            self.assertFalse(metadata.get("quant_sf_only"))
            # 420 microarray
            self.assertEqual(metadata.get("num_samples"), 420)
            self.assertEqual(metadata.get("num_experiments"), 1)

            # Make sure the data were quantile normalized
            self.assertTrue(metadata.get("quantile_normalized"))

        self.assertIn("ks_statistic", final_context)
        self.assertIn("ks_pvalue", final_context)
        self.assertEqual(final_context["ks_pvalue"], 1.0)
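
A minimal sketch (not from the codebase) of the intuition behind the ks_pvalue assertion: after quantile normalization every sample shares the same empirical distribution, so a two-sample Kolmogorov-Smirnov test, presumably run between the normalized data and the normalization target, can no longer tell them apart. The toy normalization below is an assumption for illustration, not refine.bio's implementation:

import numpy as np
from scipy.stats import ks_2samp

rng = np.random.default_rng(0)
data = rng.normal(size=(1000, 3))

# Toy quantile normalization: replace each column's values with the
# rank-wise mean of the sorted columns.
ranks = np.argsort(np.argsort(data, axis=0), axis=0)
mean_quantiles = np.sort(data, axis=0).mean(axis=1)
normalized = mean_quantiles[ranks]

print(ks_2samp(normalized[:, 0], normalized[:, 1]).pvalue)  # ~1.0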
Example #10
    def test_create_compendia_danio(self):
        job = ProcessorJob()
        job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
        job.save()

        # MICROARRAY TECH
        experiment = Experiment()
        experiment.accession_code = "GSE1234"
        experiment.save()

        result = ComputationalResult()
        result.save()

        qn_target = ComputedFile()
        qn_target.filename = "danio_target.tsv"
        qn_target.absolute_file_path = "/home/user/data_store/QN/danio_target.tsv"
        qn_target.is_qn_target = True
        qn_target.size_in_bytes = "12345"
        qn_target.sha1 = "aabbccddeeff"
        qn_target.result = result
        qn_target.save()

        danio_rerio = Organism(name="DANIO_RERIO",
                               taxonomy_id=1,
                               qn_target=result)
        danio_rerio.save()

        cra = ComputationalResultAnnotation()
        cra.data = {}
        cra.data["organism_id"] = danio_rerio.id
        cra.data["is_qn"] = True
        cra.result = result
        cra.save()

        result = ComputationalResult()
        result.save()

        micros = []
        for file in os.listdir("/home/user/data_store/raw/TEST/MICROARRAY/"):

            if "microarray.txt" in file:
                continue

            create_sample_for_experiment(
                {
                    "organism": danio_rerio,
                    "accession_code": file,
                    "technology": "MICROARRAY",
                    "filename": file,
                    "data_dir": "/home/user/data_store/raw/TEST/MICROARRAY/",
                },
                experiment,
            )

            micros.append(file)

        experiment = Experiment()
        experiment.accession_code = "GSE5678"
        experiment.save()

        result = ComputationalResult()
        result.save()
        rnas = []
        for file in os.listdir("/home/user/data_store/raw/TEST/RNASEQ/"):

            if "rnaseq.txt" in file:
                continue

            create_sample_for_experiment(
                {
                    "organism": danio_rerio,
                    "accession_code": file,
                    "technology": "RNA-SEQ",
                    "filename": file,
                    "data_dir": "/home/user/data_store/raw/TEST/RNASEQ/",
                },
                experiment,
            )

            rnas.append(file)

        # Missing sample that will be filtered
        sample = create_sample_for_experiment(
            {
                "organism": danio_rerio,
                "accession_code": "GSM1487222",
                "title": "this sample will be filtered",
                "technology": "RNA-SEQ",
                "filename": None,
            },
            experiment,
        )
        rnas.append(sample.accession_code)

        dset = Dataset()
        dset.data = {"GSE1234": micros, "GSE5678": rnas}
        dset.scale_by = "NONE"
        dset.aggregate_by = "SPECIES"
        dset.svd_algorithm = "ARPACK"
        dset.quantile_normalize = True
        dset.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dset
        pjda.save()

        final_context = create_compendia.create_compendia(job.id)

        self.assertSucceeded(job)

        # Verify result
        self.assertEqual(
            final_context["compendium_result"].result.computedfile_set.count(),
            1)
        for file in final_context[
                "compendium_result"].result.computedfile_set.all():
            self.assertTrue(os.path.exists(file.absolute_file_path))

        # test compendium_result
        self.assertEqual(final_context["compendium_result"].svd_algorithm,
                         "ARPACK")
        self.assertEqual(
            final_context["compendium_result"].primary_organism.name,
            final_context["organism_name"],
        )
        self.assertEqual(
            final_context["compendium_result"].primary_organism.name,
            "DANIO_RERIO")
        self.assertEqual(final_context["compendium_result"].organisms.count(),
                         1)

        self.assertEqual(len(final_context["filtered_samples"]), 10)

        # check that sample with no computed file was skipped
        self.assertTrue("GSM1487222" in final_context["filtered_samples"])
        self.assertEqual(
            final_context["filtered_samples"]["GSM1487222"]
            ["experiment_accession_code"], "GSE5678")
        self.assertIn(
            "This sample did not have a processed file",
            final_context["filtered_samples"]["GSM1487222"]["reason"],
        )

        # check that the 9 files with lots of missing measurements were filtered
        self.assertEqual(
            len([
                x for x in final_context["filtered_samples"].values()
                if "less than 50% present values" in x["reason"]
            ]),
            9,
        )

        zf = zipfile.ZipFile(final_context["compendium_result"].result.
                             computedfile_set.first().absolute_file_path)
        with zf.open("aggregated_metadata.json") as f:
            metadata = json.load(f)

            self.assertFalse(metadata.get("quant_sf_only"))
            self.assertEqual(metadata.get("compendium_version"), 1)

            # 420 microarray + 420 RNA seq
            # -1 that is filtered for a missing file
            # -9 that are filtered for having less than 50% present values
            self.assertEqual(metadata.get("num_samples"), 830)

            self.assertEqual(metadata.get("num_experiments"), 2)

            # Make sure the data were quantile normalized
            self.assertTrue(metadata.get("quantile_normalized"))

        self.assertIn("ks_statistic", final_context)
        self.assertIn("ks_pvalue", final_context)
        self.assertEqual(final_context["ks_pvalue"], 1.0)