Example #1
def create_qn_target(organism, platform, create_results=True):
    sample_codes_results = Sample.processed_objects.filter(
        platform_accession_code=platform,
        has_raw=True,
        technology="MICROARRAY",
        organism=organism,
        is_processed=True,
    ).values("accession_code")
    sample_codes = [res["accession_code"] for res in sample_codes_results]

    dataset = Dataset()
    dataset.data = {organism.name + "_(" + platform + ")": sample_codes}
    dataset.aggregate_by = "ALL"
    dataset.scale_by = "NONE"
    dataset.quantile_normalize = False
    dataset.save()

    job = ProcessorJob()
    job.pipeline_applied = "QN_REFERENCE"
    job.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dataset
    pjda.save()

    return qn_reference.create_qn_reference(job.pk, create_results=create_results)
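
A minimal usage sketch for the function above (a sketch, not the project's own code): it assumes the refine.bio models import from data_refinery_common.models, and borrows the platform accession "A-MEXP-1171" from Example #18 purely as a placeholder.

from data_refinery_common.models import Organism  # import path assumed

def build_human_qn_target():
    # Organism lookup mirrors the other examples in this listing.
    homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")
    # Placeholder platform accession; pick one that actually has processed
    # MICROARRAY samples for the organism.
    return create_qn_target(homo_sapiens, platform="A-MEXP-1171")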
Example #2
    def test_notify(self):

        ds = Dataset()
        ds.data = {'GSM1487313': ['GSM1487313'], 'SRS332914': ['SRS332914']}
        ds.aggregate_by = 'SPECIES'
        ds.scale_by = 'STANDARD'
        ds.email_address = "*****@*****.**"
        ds.quantile_normalize = False
        ds.save()

        pj = ProcessorJob()
        pj.pipeline_applied = "SMASHER"
        pj.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = pj
        pjda.dataset = ds
        pjda.save()

        job_context = {}
        job_context['job'] = pj
        job_context['dataset'] = ds
        job_context['upload'] = True
        job_context['result_url'] = 'https://s3.amazonaws.com/data-refinery-test-assets/all_the_things.jpg'

        final_context = smasher._notify(job_context)
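        # NOTE: .get('success', True) defaults to True, so this assertion also
        # passes when the 'success' key is missing from the final context.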
        self.assertTrue(final_context.get('success', True))
Example #3
def create_job_for_organism(organism: Organism):
    """Returns a quantpendia job for the provided organism."""
    job = ProcessorJob()
    job.pipeline_applied = ProcessorPipeline.CREATE_QUANTPENDIA.value
    job.save()

    dset = Dataset()
    dset.data = build_dataset(organism)
    dset.scale_by = "NONE"
    dset.aggregate_by = "EXPERIMENT"
    dset.quantile_normalize = False
    dset.quant_sf_only = True
    dset.svd_algorithm = "NONE"
    dset.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dset
    pjda.save()

    # Have to call this after setting the dataset since it's used in
    # the calculation.
    job.ram_amount = determine_ram_amount(job)
    job.save()

    return job
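
Other examples in this listing dispatch jobs with send_job(ProcessorPipeline..., job), so an analogous quantpendia dispatch might look like the sketch below; the import paths are assumptions, not confirmed by these snippets.

from data_refinery_common.job_lookup import ProcessorPipeline  # path assumed
from data_refinery_common.message_queue import send_job  # path assumed

def dispatch_quantpendia_job(organism):
    job = create_job_for_organism(organism)
    send_job(ProcessorPipeline.CREATE_QUANTPENDIA, job)
    return job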
Example #4
def create_job_for_organism(organisms: List[Organism], svd_algorithm="ARPACK"):
    """Returns a compendia job for the provided organism.

    Fetch all of the experiments and compile a large but normally formatted Dataset.
    """
    job = ProcessorJob()
    job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
    job.save()

    dataset = Dataset()
    dataset.data = get_dataset(organisms)
    dataset.scale_by = "NONE"
    dataset.aggregate_by = "SPECIES"
    dataset.quantile_normalize = True
    dataset.quant_sf_only = False
    dataset.svd_algorithm = svd_algorithm
    dataset.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dataset
    pjda.save()

    # Have to call this after setting the dataset since it's used in
    # the calculation.
    job.ram_amount = determine_ram_amount(job)
    job.save()

    return job
Example #5
    def test_fail(self):
        """ Test our ability to fail """

        result = ComputationalResult()
        result.save()

        sample = Sample()
        sample.accession_code = 'XXX'
        sample.title = 'XXX'
        sample.organism = Organism.get_object_for_name("HOMO_SAPIENS")
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        computed_file = ComputedFile()
        computed_file.filename = "NOT_REAL.PCL"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        ds = Dataset()
        ds.data = {'GSE51081': ['XXX']}
        ds.aggregate_by = 'EXPERIMENT'
        ds.scale_by = 'MINMAX'
        ds.email_address = "*****@*****.**"
        ds.quantile_normalize = False
        ds.save()
        dsid = ds.id

        job = ProcessorJob()
        job.pipeline_applied = "SMASHER"
        job.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = ds
        pjda.save()

        final_context = smasher.smash(job.pk, upload=False)
        ds = Dataset.objects.get(id=dsid)
        print(ds.failure_reason)
        print(final_context['dataset'].failure_reason)
        self.assertNotEqual(final_context['unsmashable_files'], [])
Example #6
    def handle(self, *args, **options):
        """ For every (or a supplied) organism, fetch all of the experiments and compile large but normally formated Dataset.

        Send all of them to the Smasher. Smash them. Retrieve manually as desired.
        """

        dataset_ids = []

        if options["organism"] is None:
            all_organisms = Organism.objects.all()
        else:
            all_organisms = [
                Organism.get_object_for_name(options["organism"].upper())
            ]

        for organism in all_organisms:
            data = {}
            experiments = Experiment.objects.filter(
                id__in=(ExperimentOrganismAssociation.objects.filter(
                    organism=organism)).values('experiment'))
            for experiment in experiments:
                data[experiment.accession_code] = list(
                    experiment.samples.filter(organism=organism).values_list(
                        'accession_code', flat=True))

            job = ProcessorJob()
            job.pipeline_applied = "COMPENDIA"
            job.save()

            dset = Dataset()
            dset.data = data
            dset.scale_by = 'NONE'
            dset.aggregate_by = 'SPECIES'
            dset.quantile_normalize = False
            dset.save()

            pjda = ProcessorJobDatasetAssociation()
            pjda.processor_job = job
            pjda.dataset = dset
            pjda.save()

            final_context = create_compendia.create_compendia(job.id)

        sys.exit(0)
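
The data dict assembled above has the same shape as every Dataset.data payload in this listing: experiment accession codes mapped to lists of sample accession codes. For example (accessions borrowed from other snippets here):

data = {
    "GSE51081": ["GSM1237810", "GSM1237812"],
    "SRP051449": ["SRR1731761", "SRR1731762"],
}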
Example #7
def create_job_for_organism(organism: Organism):
    """Returns a quantpendia job for the provided organism."""
    job = ProcessorJob()
    job.pipeline_applied = ProcessorPipeline.CREATE_QUANTPENDIA.value
    job.save()

    dset = Dataset()
    dset.data = build_dataset(organism)
    dset.scale_by = "NONE"
    dset.aggregate_by = "EXPERIMENT"
    dset.quantile_normalize = False
    dset.quant_sf_only = True
    dset.svd_algorithm = "NONE"
    dset.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dset
    pjda.save()

    return job
Example #8
    def test_bad_overlap(self):

        pj = ProcessorJob()
        pj.pipeline_applied = "SMASHER"
        pj.save()

        experiment = Experiment()
        experiment.accession_code = "GSE51081"
        experiment.save()

        result = ComputationalResult()
        result.save()

        homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

        sample = Sample()
        sample.accession_code = 'GSM1237810'
        sample.title = 'GSM1237810'
        sample.organism = homo_sapiens
        sample.save()

        sample_annotation = SampleAnnotation()
        sample_annotation.data = {'hi': 'friend'}
        sample_annotation.sample = sample
        sample_annotation.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "big.PCL"
        computed_file.absolute_file_path = "/home/user/data_store/BADSMASH/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        sample = Sample()
        sample.accession_code = 'GSM1237812'
        sample.title = 'GSM1237812'
        sample.organism = homo_sapiens
        sample.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        computed_file = ComputedFile()
        computed_file.filename = "small.PCL"
        computed_file.absolute_file_path = "/home/user/data_store/BADSMASH/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        ds = Dataset()
        ds.data = {'GSE51081': ['GSM1237810', 'GSM1237812']}
        ds.aggregate_by = 'ALL'  # [ALL or SPECIES or EXPERIMENT]
        ds.scale_by = 'NONE'  # [NONE or MINMAX or STANDARD or ROBUST]
        ds.email_address = "*****@*****.**"
        ds.quantile_normalize = False
        ds.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = pj
        pjda.dataset = ds
        pjda.save()

        final_context = smasher.smash(pj.pk, upload=False)
        ds = Dataset.objects.get(id=ds.id)

        pj = ProcessorJob()
        pj.pipeline_applied = "SMASHER"
        pj.save()

        # Now, make sure the bad can't zero this out.
        sample = Sample()
        sample.accession_code = 'GSM999'
        sample.title = 'GSM999'
        sample.organism = homo_sapiens
        sample.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        computed_file = ComputedFile()
        computed_file.filename = "bad.PCL"
        computed_file.absolute_file_path = "/home/user/data_store/BADSMASH/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        ds = Dataset()
        ds.data = {'GSE51081': ['GSM1237810', 'GSM1237812', 'GSM999']}
        ds.aggregate_by = 'ALL'  # [ALL or SPECIES or EXPERIMENT]
        ds.scale_by = 'NONE'  # [NONE or MINMAX or STANDARD or ROBUST]
        ds.email_address = "*****@*****.**"
        ds.quantile_normalize = False
        ds.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = pj
        pjda.dataset = ds
        pjda.save()

        final_context = smasher.smash(pj.pk, upload=False)
        ds = Dataset.objects.get(id=ds.id)

        self.assertEqual(len(final_context['final_frame']), 4)
Example #9
def dispatch_qn_job_if_eligible(organism: Organism):
    """Checks if the organism is eligible for a QN job and if so dispatches it.

    An organism is eligible for a QN job if it has more than MIN
    samples on a single platform.
    """
    samples = Sample.processed_objects.filter(
        organism=organism,
        has_raw=True,
        technology="MICROARRAY",
        is_processed=True,
        platform_name__contains="Affymetrix",
    )

    if samples.count() < MIN:
        logger.info(
            "Total proccessed samples don't meet minimum threshhold",
            organism=organism,
            count=samples.count(),
            min=MIN,
        )
        return

    platform_counts = (
        samples.values("platform_accession_code")
        .annotate(dcount=Count("platform_accession_code"))
        .order_by("-dcount")
    )
    biggest_platform = platform_counts[0]["platform_accession_code"]

    sample_codes_results = Sample.processed_objects.filter(
        platform_accession_code=biggest_platform,
        has_raw=True,
        technology="MICROARRAY",
        organism=organism,
        is_processed=True,
    ).values("accession_code")

    if sample_codes_results.count() < MIN:
        logger.info(
            "Number of processed samples for largest platform didn't mean threshold.",
            organism=organism,
            platform_accession_code=biggest_platform,
            count=sample_codes_results.count(),
            min=MIN,
        )
        return

    sample_codes = [res["accession_code"] for res in sample_codes_results]

    dataset = Dataset()
    dataset.data = {organism.name + "_(" + biggest_platform + ")": sample_codes}
    dataset.aggregate_by = "ALL"
    dataset.scale_by = "NONE"
    dataset.quantile_normalize = False
    dataset.save()

    job = ProcessorJob()
    job.pipeline_applied = "QN_REFERENCE"
    job.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = job
    pjda.dataset = dataset
    pjda.save()

    logger.info("Sending QN_REFERENCE for Organism", job_id=str(job.pk), organism=str(organism))
    send_job(ProcessorPipeline.QN_REFERENCE, job)

    return job
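
A small driver sketch that applies the eligibility check to every organism, using Organism.objects.all() the same way Example #6 does:

from data_refinery_common.models import Organism  # import path assumed

def dispatch_all_eligible_qn_jobs():
    for organism in Organism.objects.all():
        dispatch_qn_job_if_eligible(organism)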
Example #10
    def test_create_compendia(self):
        DATA_DIR = "/home/user/data_store/PCL/"

        job = ProcessorJob()
        job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
        job.save()

        gallus_gallus = Organism.get_object_for_name("GALLUS_GALLUS",
                                                     taxonomy_id=1001)

        # MICROARRAY TECH
        (experiment,
         _) = Experiment.objects.get_or_create(accession_code="GSE1487313")
        experiment.accession_code = "GSE1487313"
        experiment.save()

        create_sample_for_experiment(
            {
                "organism": gallus_gallus,
                "accession_code": "GSM1487313",
                "technology": "MICROARRAY",
                "filename": "GSM1487313_liver.PCL",
                "data_dir": DATA_DIR,
            },
            experiment,
        )

        # Missing sample that will be filtered
        create_sample_for_experiment(
            {
                "organism": gallus_gallus,
                "accession_code": "GSM1487222",
                "title": "this sample will be filtered",
                "technology": "MICROARRAY",
                "filename": "GSM1487222_empty.PCL",
                "data_dir": DATA_DIR,
            },
            experiment,
        )

        # RNASEQ TECH
        experiment2 = Experiment()
        experiment2.accession_code = "SRP149598"
        experiment2.save()

        create_sample_for_experiment(
            {
                "organism": gallus_gallus,
                "accession_code": "SRR7250867",
                "technology": "RNA-SEQ",
                "filename": "SRP149598_gene_lengthScaledTPM.tsv",
                "data_dir": DATA_DIR,
            },
            experiment2,
        )

        dset = Dataset()
        dset.data = {
            "GSE1487313": ["GSM1487313", "GSM1487222"],
            "SRP149598": ["SRR7250867"],
        }
        dset.scale_by = "NONE"
        dset.aggregate_by = "SPECIES"
        dset.svd_algorithm = "ARPACK"
        dset.quantile_normalize = True
        dset.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dset
        pjda.save()

        final_context = create_compendia.create_compendia(job.id)

        # Because one of the samples is filtered out, there will be too few
        # remaining samples to smash together, so we expect this job to fail.
        self.assertFailed(job, "k must be between 1 and min(A.shape)")

        # check that sample with no computed file was skipped
        self.assertTrue("GSM1487222" in final_context["filtered_samples"])
        self.assertEqual(
            final_context["filtered_samples"]["GSM1487222"]
            ["experiment_accession_code"],
            "GSE1487313",
        )
Example #11
    def test_create_compendia(self):
        job = ProcessorJob()
        job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
        job.save()

        # MICROARRAY TECH
        experiment = Experiment()
        experiment.accession_code = "GSE1487313"
        experiment.save()

        result = ComputationalResult()
        result.save()

        gallus_gallus = Organism.get_object_for_name("GALLUS_GALLUS",
                                                     taxonomy_id=1001)

        sample = Sample()
        sample.accession_code = "GSM1487313"
        sample.title = "GSM1487313"
        sample.organism = gallus_gallus
        sample.technology = "MICROARRAY"
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "GSM1487313_liver.PCL"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        # Missing sample that will be filtered
        sample = Sample()
        sample.accession_code = "GSM1487222"
        sample.title = "this sample will be filtered"
        sample.organism = gallus_gallus
        sample.technology = "MICROARRAY"
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "GSM1487222_empty.PCL"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/doesnt_exists.PCL"
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        # RNASEQ TECH
        experiment2 = Experiment()
        experiment2.accession_code = "SRS332914"
        experiment2.save()

        result2 = ComputationalResult()
        result2.save()

        sample2 = Sample()
        sample2.accession_code = "SRS332914"
        sample2.title = "SRS332914"
        sample2.organism = gallus_gallus
        sample2.technology = "RNA-SEQ"
        sample2.save()

        sra2 = SampleResultAssociation()
        sra2.sample = sample2
        sra2.result = result2
        sra2.save()

        esa2 = ExperimentSampleAssociation()
        esa2.experiment = experiment2
        esa2.sample = sample2
        esa2.save()

        computed_file2 = ComputedFile()
        computed_file2.filename = "SRP149598_gene_lengthScaledTPM.tsv"
        computed_file2.absolute_file_path = "/home/user/data_store/PCL/" + computed_file2.filename
        computed_file2.result = result2
        computed_file2.size_in_bytes = 234
        computed_file2.is_smashable = True
        computed_file2.save()

        assoc2 = SampleComputedFileAssociation()
        assoc2.sample = sample2
        assoc2.computed_file = computed_file2
        assoc2.save()

        dset = Dataset()
        dset.data = {
            "GSE1487313": ["GSM1487313", "GSM1487222"],
            "SRX332914": ["SRS332914"]
        }
        dset.scale_by = "NONE"
        dset.aggregate_by = "SPECIES"
        dset.svd_algorithm = "ARPACK"
        dset.quantile_normalize = False
        dset.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dset
        pjda.save()

        final_context = create_compendia.create_compendia(job.id)

        self.assertFalse(job.success)

        # check that sample with no computed file was skipped
        self.assertTrue("GSM1487222" in final_context["filtered_samples"])
        self.assertEqual(
            final_context["filtered_samples"]["GSM1487222"]
            ["experiment_accession_code"],
            "GSE1487313",
        )
Example #12
    def test_no_smash_dupe_two(self):
        """ Tests the SRP051449 case, where the titles collide. Also uses a real QN target file."""

        job = ProcessorJob()
        job.pipeline_applied = "SMASHER"
        job.save()

        experiment = Experiment()
        experiment.accession_code = "SRP051449"
        experiment.save()

        result = ComputationalResult()
        result.save()

        danio_rerio = Organism.get_object_for_name("DANIO_RERIO")

        sample = Sample()
        sample.accession_code = 'SRR1731761'
        sample.title = 'Danio rerio'
        sample.organism = danio_rerio
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "SRR1731761_output_gene_lengthScaledTPM.tsv"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        result = ComputationalResult()
        result.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        sample = Sample()
        sample.accession_code = 'SRR1731762'
        sample.title = 'Danio rerio'
        sample.organism = danio_rerio
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "SRR1731762_output_gene_lengthScaledTPM.tsv"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        result = ComputationalResult()
        result.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        ds = Dataset()
        ds.data = {'SRP051449': ['SRR1731761', 'SRR1731762']}
        ds.aggregate_by = 'SPECIES'
        ds.scale_by = 'NONE'
        ds.email_address = "*****@*****.**"
        ds.quantile_normalize = True
        ds.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = ds
        pjda.save()

        cr = ComputationalResult()
        cr.save()

        computed_file = ComputedFile()
        computed_file.filename = "danio_target.tsv"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = cr
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = False
        computed_file.save()

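        # This annotation marks danio_target.tsv as the QN target for Danio
        # rerio; with quantile_normalize=True, the smasher looks the target up
        # by organism (compare utils.get_most_recent_qn_target_for_organism in
        # Example #18).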
        cra = ComputationalResultAnnotation()
        cra.data = {'organism_id': danio_rerio.id, 'is_qn': True}
        cra.result = cr
        cra.save()

        final_context = smasher.smash(job.pk, upload=False)
        self.assertTrue(final_context['success'])
Example #13
    def test_dualtech_smash(self):
        """ """

        pj = ProcessorJob()
        pj.pipeline_applied = "SMASHER"
        pj.save()

        # MICROARRAY TECH
        experiment = Experiment()
        experiment.accession_code = "GSE1487313"
        experiment.save()

        result = ComputationalResult()
        result.save()

        gallus_gallus = Organism.get_object_for_name("GALLUS_GALLUS")

        sample = Sample()
        sample.accession_code = 'GSM1487313'
        sample.title = 'GSM1487313'
        sample.organism = gallus_gallus
        sample.technology = "MICROARRAY"
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "GSM1487313_liver.PCL"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        # RNASEQ TECH
        experiment2 = Experiment()
        experiment2.accession_code = "SRS332914"
        experiment2.save()

        result2 = ComputationalResult()
        result2.save()

        sample2 = Sample()
        sample2.accession_code = 'SRS332914'
        sample2.title = 'SRS332914'
        sample2.organism = gallus_gallus
        sample2.technology = "RNA-SEQ"
        sample2.save()

        sra2 = SampleResultAssociation()
        sra2.sample = sample2
        sra2.result = result2
        sra2.save()

        esa2 = ExperimentSampleAssociation()
        esa2.experiment = experiment2
        esa2.sample = sample2
        esa2.save()

        computed_file2 = ComputedFile()
        computed_file2.filename = "SRP149598_gene_lengthScaledTPM.tsv"
        computed_file2.absolute_file_path = "/home/user/data_store/PCL/" + computed_file2.filename
        computed_file2.result = result2
        computed_file2.size_in_bytes = 234
        computed_file2.is_smashable = True
        computed_file2.save()

        assoc2 = SampleComputedFileAssociation()
        assoc2.sample = sample2
        assoc2.computed_file = computed_file2
        assoc2.save()

        # CROSS-SMASH BY SPECIES
        ds = Dataset()
        ds.data = {'GSE1487313': ['GSM1487313'], 'SRX332914': ['SRS332914']}
        ds.aggregate_by = 'SPECIES'
        ds.scale_by = 'STANDARD'
        ds.email_address = "*****@*****.**"
        ds.quantile_normalize = False
        ds.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = pj
        pjda.dataset = ds
        pjda.save()

        self.assertTrue(ds.is_cross_technology())
        final_context = smasher.smash(pj.pk, upload=False)
        self.assertTrue(os.path.exists(final_context['output_file']))
        os.remove(final_context['output_file'])
        self.assertEqual(len(final_context['final_frame'].columns), 2)

        # THEN BY EXPERIMENT
        ds.aggregate_by = 'EXPERIMENT'
        ds.save()

        dsid = ds.id
        ds = Dataset.objects.get(id=dsid)

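        # Clearing the timestamps presumably lets the same ProcessorJob be
        # re-run for the next aggregation mode.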
        pj.start_time = None
        pj.end_time = None
        pj.save()

        final_context = smasher.smash(pj.pk, upload=False)
        self.assertTrue(os.path.exists(final_context['output_file']))
        os.remove(final_context['output_file'])
        self.assertEqual(len(final_context['final_frame'].columns), 1)

        # THEN BY ALL
        ds.aggregate_by = 'ALL'
        ds.save()

        dsid = ds.id
        ds = Dataset.objects.get(id=dsid)

        pj.start_time = None
        pj.end_time = None
        pj.save()
        final_context = smasher.smash(pj.pk, upload=False)
        self.assertTrue(os.path.exists(final_context['output_file']))
        self.assertEqual(len(final_context['final_frame'].columns), 2)
Example #14
def prepare_job():
    pj = ProcessorJob()
    pj.pipeline_applied = "SMASHER"
    pj.save()

    experiment = Experiment()
    experiment.accession_code = "GSE51081"
    experiment.save()

    result = ComputationalResult()
    result.save()

    homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

    sample = Sample()
    sample.accession_code = 'GSM1237810'
    sample.title = 'GSM1237810'
    sample.organism = homo_sapiens
    sample.save()

    sample_annotation = SampleAnnotation()
    sample_annotation.data = {'hi': 'friend'}
    sample_annotation.sample = sample
    sample_annotation.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1237810_T09-1084.PCL"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    sample = Sample()
    sample.accession_code = 'GSM1237812'
    sample.title = 'GSM1237812'
    sample.organism = homo_sapiens
    sample.save()

    esa = ExperimentSampleAssociation()
    esa.experiment = experiment
    esa.sample = sample
    esa.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    sra = SampleResultAssociation()
    sra.sample = sample
    sra.result = result
    sra.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1237812_S97-PURE.PCL"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = True
    computed_file.save()

    computed_file = ComputedFile()
    computed_file.filename = "GSM1237812_S97-PURE.DAT"
    computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
    computed_file.result = result
    computed_file.size_in_bytes = 123
    computed_file.is_smashable = False
    computed_file.save()

    assoc = SampleComputedFileAssociation()
    assoc.sample = sample
    assoc.computed_file = computed_file
    assoc.save()

    ds = Dataset()
    ds.data = {'GSE51081': ['GSM1237810', 'GSM1237812']}
    ds.aggregate_by = 'EXPERIMENT'  # [ALL or SPECIES or EXPERIMENT]
    ds.scale_by = 'STANDARD'  # [NONE or MINMAX or STANDARD or ROBUST]
    ds.email_address = "*****@*****.**"
    ds.quantile_normalize = False
    ds.save()

    pjda = ProcessorJobDatasetAssociation()
    pjda.processor_job = pj
    pjda.dataset = ds
    pjda.save()

    return pj
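
Usage sketch mirroring the tests around this helper: run the smasher on the fixture job without uploading the result.

pj = prepare_job()
final_context = smasher.smash(pj.pk, upload=False)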
Example #15
    def test_no_smash_all_diff_species(self):
        """ Smashing together with 'ALL' with different species is a really weird behavior. 
        This test isn't really testing a normal case, just make sure that it's marking the
        unsmashable files.
        """

        job = ProcessorJob()
        job.pipeline_applied = "SMASHER"
        job.save()

        experiment = Experiment()
        experiment.accession_code = "GSE51081"
        experiment.save()

        result = ComputationalResult()
        result.save()

        homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

        sample = Sample()
        sample.accession_code = 'GSM1237810'
        sample.title = 'GSM1237810'
        sample.organism = homo_sapiens
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "GSM1237810_T09-1084.PCL"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        result = ComputationalResult()
        result.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        experiment = Experiment()
        experiment.accession_code = "GSE51084"
        experiment.save()

        mus_mus = Organism.get_object_for_name("MUS_MUSCULUS")

        sample = Sample()
        sample.accession_code = 'GSM1238108'
        sample.title = 'GSM1238108'
        sample.organism = homo_sapiens
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "GSM1238108-tbl-1.txt"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        ds = Dataset()
        ds.data = {'GSE51081': ['GSM1237810'], 'GSE51084': ['GSM1238108']}
        ds.aggregate_by = 'ALL'
        ds.scale_by = 'STANDARD'
        ds.email_address = "*****@*****.**"
        ds.quantile_normalize = False
        ds.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = ds
        pjda.save()

        final_context = smasher.smash(job.pk, upload=False)

        dsid = ds.id
        ds = Dataset.objects.get(id=dsid)
        print(ds.failure_reason)
        print(final_context['dataset'].failure_reason)

        self.assertEqual(final_context['unsmashable_files'], ['GSM1238108'])
Example #16
    def test_qn_reference(self):
        # We don't have a 0.tsv
        experiment = prepare_experiment(range(1, 201))

        job = ProcessorJob()
        job.pipeline_applied = "QN_REFERENCE"
        job.save()

        dataset = Dataset()
        dataset.data = {"12345": ["1", "2", "3", "4", "5", "6"]}
        dataset.aggregate_by = "ALL"
        dataset.scale_by = "NONE"
        dataset.quantile_normalize = False  # We don't QN because we're creating the target now
        dataset.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dataset
        pjda.save()

        final_context = qn_reference.create_qn_reference(job.pk)
        self.assertTrue(final_context["success"])
        self.assertTrue(os.path.exists(final_context["target_file"]))
        self.assertEqual(os.path.getsize(final_context["target_file"]), 562)

        homo_sapiens = Organism.objects.get(taxonomy_id=9606)
        target = homo_sapiens.qn_target.computedfile_set.latest()
        self.assertEqual(target.sha1,
                         "de69d348f8b239479e2330d596c4013a7b0b2b6a")

        # Create and run a smasher job that will use the QN target we just made.
        pj = ProcessorJob()
        pj.pipeline_applied = "SMASHER"
        pj.save()

        ds = Dataset()
        ds.data = {"12345": ["1", "2", "3", "4", "5"]}
        ds.aggregate_by = "SPECIES"
        ds.scale_by = "STANDARD"
        ds.email_address = "*****@*****.**"
        ds.quantile_normalize = True
        ds.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = pj
        pjda.dataset = ds
        pjda.save()

        final_context = smasher.smash(pj.pk, upload=False)
        self.assertTrue(final_context["success"])

        np.testing.assert_almost_equal(final_context["merged_qn"]["1"][0],
                                       -0.4379488527774811)
        np.testing.assert_almost_equal(
            final_context["original_merged"]["1"][0], -0.5762109)

        # Make sure that the results were created. We create 200 computed files
        # and computational results (1 for each sample) plus the one generated
        # by the QN reference processor.
        self.assertEqual(ComputedFile.objects.all().count(), 200 + 1)
        self.assertEqual(ComputationalResult.objects.all().count(), 200 + 1)
        self.assertEqual(ComputationalResultAnnotation.objects.all().count(),
                         1)
Example #17
    def handle(self, *args, **options):
        """
        """

        if not options["job_id"]:
            if options["organism"] is None and not options["all"]:
                logger.error("You must specify an organism or --all")
                sys.exit(1)

            if options["organism"] and (options.get("organism", "") != "ALL"):
                organisms = [
                    Organism.get_object_for_name(options["organism"].upper())
                ]
            else:
                organisms = Organism.objects.all()

            for organism in organisms:
                if not organism_can_have_qn_target(organism):
                    logger.error(
                        "Organism does not have any platform with enough samples to generate a qn target",
                        organism=organism,
                        min=options["min"],
                    )
                    continue

                samples = organism.sample_set.filter(has_raw=True,
                                                     technology="MICROARRAY",
                                                     is_processed=True)
                if samples.count() == 0:
                    logger.error(
                        "No processed samples for organism.",
                        organism=organism,
                        count=samples.count(),
                    )
                    continue

                if options["platform"] is None:
                    platform_counts = (
                        samples.values("platform_accession_code").annotate(
                            dcount=Count("platform_accession_code")).order_by(
                                "-dcount"))
                    biggest_platform = platform_counts[0][
                        "platform_accession_code"]
                else:
                    biggest_platform = options["platform"]

                sample_codes_results = Sample.processed_objects.filter(
                    platform_accession_code=biggest_platform,
                    has_raw=True,
                    technology="MICROARRAY",
                    organism=organism,
                    is_processed=True,
                ).values("accession_code")
                sample_codes = [
                    res["accession_code"] for res in sample_codes_results
                ]

                dataset = Dataset()
                dataset.data = {
                    organism.name + "_(" + biggest_platform + ")": sample_codes
                }
                dataset.aggregate_by = "ALL"
                dataset.scale_by = "NONE"
                dataset.quantile_normalize = False
                dataset.save()

                job = ProcessorJob()
                job.pipeline_applied = "QN_REFERENCE"
                job.save()

                pjda = ProcessorJobDatasetAssociation()
                pjda.processor_job = job
                pjda.dataset = dataset
                pjda.save()

                final_context = qn_reference.create_qn_reference(job.pk)

                if final_context["success"]:
                    print(":D")
                    self.stdout.write("Target file: " +
                                      final_context["target_file"])
                    self.stdout.write(
                        "Target S3: " +
                        str(final_context["computed_files"][0].get_s3_url()))
                else:
                    print(":(")
        else:
            qn_reference.create_qn_reference(options["job_id"])
Example #18
    def test_qn_reference(self):
        job = ProcessorJob()
        job.pipeline_applied = "QN_REFERENCE"
        job.save()

        homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

        experiment = Experiment()
        experiment.accession_code = "12345"
        experiment.save()

        for code in ['1', '2', '3', '4', '5', '6']:
            sample = Sample()
            sample.accession_code = code
            sample.title = code
            sample.platform_accession_code = 'A-MEXP-1171'
            sample.manufacturer = "SLIPPERY DICK'S DISCOUNT MICROARRAYS"
            sample.organism = homo_sapiens
            sample.technology = "MICROARRAY"
            sample.is_processed = True
            sample.save()

            cr = ComputationalResult()
            cr.save()

            file = ComputedFile()
            file.filename = code + ".tsv"
            file.absolute_file_path = "/home/user/data_store/QN/" + code + ".tsv"
            file.size_in_bytes = int(code)
            file.result = cr
            file.is_smashable = True
            file.save()

            scfa = SampleComputedFileAssociation()
            scfa.sample = sample
            scfa.computed_file = file
            scfa.save()

            exsa = ExperimentSampleAssociation()
            exsa.experiment = experiment
            exsa.sample = sample
            exsa.save()

        
        dataset = Dataset()
        dataset.data = {"12345": ["1", "2", "3", "4", "5", "6"]}
        dataset.aggregate_by = "ALL"
        dataset.scale_by = "NONE"
        dataset.quantile_normalize = False # We don't QN because we're creating the target now
        dataset.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dataset
        pjda.save()

        final_context = qn_reference.create_qn_reference(job.pk)
        self.assertTrue(final_context['success'])

        self.assertTrue(os.path.exists(final_context['target_file']))
        self.assertEqual(os.path.getsize(final_context['target_file']), 556)

        target = utils.get_most_recent_qn_target_for_organism(homo_sapiens)
        self.assertEqual(target.sha1, '636d72d5cbf4b9785b0bd271a1430b615feaa7ea')

        ###
        # Smasher with QN
        ###

        pj = ProcessorJob()
        pj.pipeline_applied = "SMASHER"
        pj.save()

        ds = Dataset()
        ds.data = {"12345": ["1", "2", "3", "4", "5"]}
        ds.aggregate_by = 'SPECIES'
        ds.scale_by = 'STANDARD'
        ds.email_address = "*****@*****.**"
        ds.quantile_normalize = True
        ds.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = pj
        pjda.dataset = ds
        pjda.save()

        final_context = smasher.smash(pj.pk, upload=False)
        self.assertTrue(final_context['success'])

        self.assertEqual(final_context['merged_qn']['1'][0], -0.4379488528812934)
        self.assertEqual(final_context['original_merged']['1'][0], -0.576210936113982)

        ###
        # Test via management command
        ###

        from django.core.management import call_command
        from django.test import TestCase
        from django.utils.six import StringIO

        out = StringIO()
        try:
            call_command('create_qn_target', organism='homo_sapiens', min=1, stdout=out)
        except SystemExit:  # this is okay!
            pass

        stdout = out.getvalue()
        self.assertTrue('Target file' in stdout)
        path = stdout.split('\n')[0].split(':')[1].strip()
        self.assertTrue(os.path.exists(path))
        self.assertEqual(path, utils.get_most_recent_qn_target_for_organism(homo_sapiens).absolute_file_path)
Example #19
    def test_create_quantpendia(self):
        job = ProcessorJob()
        job.pipeline_applied = ProcessorPipeline.CREATE_QUANTPENDIA.value
        job.save()

        experiment = Experiment()
        experiment.accession_code = "GSE51088"
        experiment.save()

        result = ComputationalResult()
        result.save()

        homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS",
                                                    taxonomy_id=9606)

        sample = Sample()
        sample.accession_code = "GSM1237818"
        sample.title = "GSM1237818"
        sample.organism = homo_sapiens
        sample.technology = "RNA-SEQ"
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.s3_key = "smasher-test-quant.sf"
        computed_file.s3_bucket = "data-refinery-test-assets"
        computed_file.filename = "quant.sf"
        computed_file.absolute_file_path = "/home/user/data_store/QUANT/smasher-test-quant.sf"
        computed_file.result = result
        computed_file.is_smashable = True
        computed_file.size_in_bytes = 123123
        computed_file.sha1 = (
            "08c7ea90b66b52f7cd9d9a569717a1f5f3874967"  # this matches with the downloaded file
        )
        computed_file.save()

        computed_file = ComputedFile()
        computed_file.filename = "logquant.tsv"
        computed_file.is_smashable = True
        computed_file.size_in_bytes = 123123
        computed_file.result = result
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        ds = Dataset()
        ds.data = {"GSE51088": ["GSM1237818"]}
        ds.aggregate_by = "EXPERIMENT"
        ds.scale_by = "STANDARD"
        ds.email_address = "*****@*****.**"
        ds.quant_sf_only = True  # Make the dataset include quant.sf files only
        ds.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = ds
        pjda.save()

        final_context = create_quantpendia(job.id)

        self.assertTrue(
            os.path.exists(final_context["output_dir"] +
                           "/GSE51088/GSM1237818_quant.sf"))
        self.assertTrue(
            os.path.exists(final_context["output_dir"] + "/README.md"))
        self.assertTrue(
            os.path.exists(final_context["output_dir"] + "/LICENSE.TXT"))
        self.assertTrue(
            os.path.exists(final_context["output_dir"] +
                           "/aggregated_metadata.json"))

        self.assertTrue(final_context["metadata"]["quant_sf_only"])
        self.assertEqual(final_context["metadata"]["num_samples"], 1)
        self.assertEqual(final_context["metadata"]["num_experiments"], 1)

        # test that archive exists
        quantpendia_file = ComputedFile.objects.filter(
            is_compendia=True, quant_sf_only=True).latest()
        self.assertTrue(os.path.exists(quantpendia_file.absolute_file_path))
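
For comparison, Examples #3 and #7 build their quant-only datasets with scaling and quantile normalization disabled; condensed, those quant-only flags look like this sketch (not an authoritative recipe):

ds = Dataset()
ds.data = {"GSE51088": ["GSM1237818"]}
ds.aggregate_by = "EXPERIMENT"
ds.scale_by = "NONE"
ds.quantile_normalize = False
ds.quant_sf_only = True  # package per-sample quant.sf files only
ds.save()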
Example #20
    def handle(self, *args, **options):
        """
        """

        if options["organism"] is None and not options["all"]:
            logger.error("You must specify an organism or --all")
            sys.exit(1)

        if options["organism"] and (options.get('organism', '') != "ALL"):
            organisms = [
                Organism.get_object_for_name(options["organism"].upper())
            ]
        else:
            organisms = Organism.objects.all()

        for organism in organisms:
            samples = Sample.processed_objects.filter(organism=organism,
                                                      has_raw=True,
                                                      technology="MICROARRAY",
                                                      is_processed=True)
            if samples.count() == 0:
                logger.error("No processed samples for organism.",
                             organism=organism,
                             count=samples.count())
                continue
            if samples.count() < options['min']:
                logger.error(
                    "Proccessed samples don't meet minimum threshhold",
                    organism=organism,
                    count=samples.count(),
                    min=options["min"])
                continue

            if options["platform"] is None:
                platform_counts = samples.values(
                    'platform_accession_code').annotate(dcount=Count(
                        'platform_accession_code')).order_by('-dcount')
                biggest_platform = platform_counts[0][
                    'platform_accession_code']
            else:
                biggest_platform = options["platform"]

            sample_codes_results = Sample.processed_objects.filter(
                platform_accession_code=biggest_platform,
                has_raw=True,
                technology="MICROARRAY",
                is_processed=True).values('accession_code')
            sample_codes = [
                res['accession_code'] for res in sample_codes_results
            ]

            dataset = Dataset()
            dataset.data = {
                organism.name + '_(' + biggest_platform + ')': sample_codes
            }
            dataset.aggregate_by = "ALL"
            dataset.scale_by = "NONE"
            dataset.quantile_normalize = False
            dataset.save()

            job = ProcessorJob()
            job.pipeline_applied = "QN_REFERENCE"
            job.save()

            pjda = ProcessorJobDatasetAssociation()
            pjda.processor_job = job
            pjda.dataset = dataset
            pjda.save()

            final_context = qn_reference.create_qn_reference(job.pk)

            if final_context['success']:
                print(":D")
                self.stdout.write("Target file: " +
                                  final_context['target_file'])
                self.stdout.write(
                    "Target S3: " +
                    str(final_context['computed_files'][0].get_s3_url()))
            else:
                print(":(")
Example #21
    def test_create_compendia_danio(self):
        job = ProcessorJob()
        job.pipeline_applied = "COMPENDIA"
        job.save()

        # MICROARRAY TECH
        experiment = Experiment()
        experiment.accession_code = "GSE1234"
        experiment.save()

        result = ComputationalResult()
        result.save()

        danio_rerio = Organism.get_object_for_name("DANIO_RERIO")

        micros = []
        for file in os.listdir('/home/user/data_store/raw/TEST/MICROARRAY/'):

            if 'microarray.txt' in file:
                continue

            sample = Sample()
            sample.accession_code = file
            sample.title = file
            sample.organism = danio_rerio
            sample.technology = "MICROARRAY"
            sample.save()

            sra = SampleResultAssociation()
            sra.sample = sample
            sra.result = result
            sra.save()

            esa = ExperimentSampleAssociation()
            esa.experiment = experiment
            esa.sample = sample
            esa.save()

            computed_file = ComputedFile()
            computed_file.filename = file
            computed_file.absolute_file_path = "/home/user/data_store/raw/TEST/MICROARRAY/" + file
            computed_file.result = result
            computed_file.size_in_bytes = 123
            computed_file.is_smashable = True
            computed_file.save()

            assoc = SampleComputedFileAssociation()
            assoc.sample = sample
            assoc.computed_file = computed_file
            assoc.save()

            micros.append(file)

        experiment = Experiment()
        experiment.accession_code = "GSE5678"
        experiment.save()

        result = ComputationalResult()
        result.save()
        rnas = []
        for file in os.listdir('/home/user/data_store/raw/TEST/RNASEQ/'):

            if 'rnaseq.txt' in file:
                continue

            sample = Sample()
            sample.accession_code = file
            sample.title = file
            sample.organism = danio_rerio
            sample.technology = "RNASEQ"
            sample.save()

            sra = SampleResultAssociation()
            sra.sample = sample
            sra.result = result
            sra.save()

            esa = ExperimentSampleAssociation()
            esa.experiment = experiment
            esa.sample = sample
            esa.save()

            computed_file = ComputedFile()
            computed_file.filename = file
            computed_file.absolute_file_path = "/home/user/data_store/raw/TEST/RNASEQ/" + file
            computed_file.result = result
            computed_file.size_in_bytes = 123
            computed_file.is_smashable = True
            computed_file.save()

            assoc = SampleComputedFileAssociation()
            assoc.sample = sample
            assoc.computed_file = computed_file
            assoc.save()

            rnas.append(file)

        result = ComputationalResult()
        result.save()

        qn_target = ComputedFile()
        qn_target.filename = "danio_target.tsv"
        qn_target.absolute_file_path = '/home/user/data_store/QN/danio_target.tsv'
        qn_target.is_qn_target = True
        qn_target.size_in_bytes = 12345
        qn_target.sha1 = "aabbccddeeff"
        qn_target.result = result
        qn_target.save()

        cra = ComputationalResultAnnotation()
        cra.data = {}
        cra.data['organism_id'] = danio_rerio.id
        cra.data['is_qn'] = True
        cra.result = result
        cra.save()

        dset = Dataset()
        dset.data = {'GSE1234': micros, 'GSE5678': rnas}
        dset.scale_by = 'NONE'
        dset.aggregate_by = 'SPECIES'
        dset.quantile_normalize = False
        dset.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dset
        pjda.save()

        final_context = create_compendia.create_compendia(job.id)

        # Verify result
        self.assertEqual(len(final_context['computed_files']), 3)
        for file in final_context['computed_files']:
            self.assertTrue(os.path.exists(file.absolute_file_path))
Example #22
    def test_create_compendia(self):
        job = ProcessorJob()
        job.pipeline_applied = "COMPENDIA"
        job.save()

        # MICROARRAY TECH
        experiment = Experiment()
        experiment.accession_code = "GSE1487313"
        experiment.save()

        result = ComputationalResult()
        result.save()

        gallus_gallus = Organism.get_object_for_name("GALLUS_GALLUS")

        sample = Sample()
        sample.accession_code = 'GSM1487313'
        sample.title = 'GSM1487313'
        sample.organism = gallus_gallus
        sample.technology = "MICROARRAY"
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "GSM1487313_liver.PCL"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        # RNASEQ TECH
        experiment2 = Experiment()
        experiment2.accession_code = "SRS332914"
        experiment2.save()

        result2 = ComputationalResult()
        result2.save()

        sample2 = Sample()
        sample2.accession_code = 'SRS332914'
        sample2.title = 'SRS332914'
        sample2.organism = gallus_gallus
        sample2.technology = "RNA-SEQ"
        sample2.save()

        sra2 = SampleResultAssociation()
        sra2.sample = sample2
        sra2.result = result2
        sra2.save()

        esa2 = ExperimentSampleAssociation()
        esa2.experiment = experiment2
        esa2.sample = sample2
        esa2.save()

        computed_file2 = ComputedFile()
        computed_file2.filename = "SRP149598_gene_lengthScaledTPM.tsv"
        computed_file2.absolute_file_path = "/home/user/data_store/PCL/" + computed_file2.filename
        computed_file2.result = result2
        computed_file2.size_in_bytes = 234
        computed_file2.is_smashable = True
        computed_file2.save()

        assoc2 = SampleComputedFileAssociation()
        assoc2.sample = sample2
        assoc2.computed_file = computed_file2
        assoc2.save()

        dset = Dataset()
        dset.data = {'GSE1487313': ['GSM1487313'], 'SRS332914': ['SRS332914']}
        dset.scale_by = 'NONE'
        dset.aggregate_by = 'SPECIES'
        dset.quantile_normalize = False
        dset.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dset
        pjda.save()

        final_context = create_compendia.create_compendia(job.id)
Example #23
    def test_no_smash_dupe(self):
        """ """

        job = ProcessorJob()
        job.pipeline_applied = "SMASHER"
        job.save()

        experiment = Experiment()
        experiment.accession_code = "GSE51081"
        experiment.save()

        result = ComputationalResult()
        result.save()

        homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

        sample = Sample()
        sample.accession_code = 'GSM1237810'
        sample.title = 'GSM1237810'
        sample.organism = homo_sapiens
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "GSM1237810_T09-1084.PCL"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        result = ComputationalResult()
        result.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        sample = Sample()
        sample.accession_code = 'GSM1237811'
        sample.title = 'GSM1237811'
        sample.organism = homo_sapiens
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        result = ComputationalResult()
        result.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        ds = Dataset()
        ds.data = {'GSE51081': ['GSM1237810', 'GSM1237811']}
        ds.aggregate_by = 'ALL'
        ds.scale_by = 'STANDARD'
        ds.email_address = "*****@*****.**"
        ds.quantile_normalize = False
        ds.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = ds
        pjda.save()

        final_context = smasher.smash(job.pk, upload=False)

        dsid = ds.id
        ds = Dataset.objects.get(id=dsid)

        self.assertTrue(ds.success)
        for column in final_context['original_merged'].columns:
            self.assertTrue('_x' not in column)
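
Note: the `'_x' not in column` assertion above guards against pandas' behavior of suffixing duplicated column names during a merge. A minimal illustration of the failure mode being tested, using made-up frames rather than refinebio code:

import pandas as pd

left = pd.DataFrame({"GENE": ["A", "B"], "GSM1237810": [1.0, 2.0]})
right = pd.DataFrame({"GENE": ["A", "B"], "GSM1237810": [3.0, 4.0]})

# Merging two frames that share a non-key column yields 'GSM1237810_x'
# and 'GSM1237810_y' instead of a single sample column.
merged = left.merge(right, on="GENE")
print(list(merged.columns))  # ['GENE', 'GSM1237810_x', 'GSM1237810_y']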
Example #24
    def test_qn_reference(self):
        job = ProcessorJob()
        job.pipeline_applied = "QN_REFERENCE"
        job.save()

        homo_sapiens = Organism(name="HOMO_SAPIENS", taxonomy_id=9606)
        homo_sapiens.save()

        experiment = Experiment()
        experiment.accession_code = "12345"
        experiment.save()
        # We don't have a 0.tsv
        codes = [str(i) for i in range(1, 201)]

        for code in codes:
            sample = Sample()
            sample.accession_code = code
            sample.title = code
            sample.platform_accession_code = "A-MEXP-1171"
            sample.manufacturer = "SLIPPERY DICK'S DISCOUNT MICROARRAYS"
            sample.organism = homo_sapiens
            sample.technology = "MICROARRAY"
            sample.is_processed = True
            sample.save()

            cr = ComputationalResult()
            cr.save()

            computed_file = ComputedFile()
            computed_file.filename = code + ".tsv"
            computed_file.absolute_file_path = "/home/user/data_store/QN/" + code + ".tsv"
            computed_file.size_in_bytes = int(code)
            computed_file.result = cr
            computed_file.is_smashable = True
            computed_file.save()

            scfa = SampleComputedFileAssociation()
            scfa.sample = sample
            scfa.computed_file = computed_file
            scfa.save()

            exsa = ExperimentSampleAssociation()
            exsa.experiment = experiment
            exsa.sample = sample
            exsa.save()

        dataset = Dataset()
        dataset.data = {"12345": ["1", "2", "3", "4", "5", "6"]}
        dataset.aggregate_by = "ALL"
        dataset.scale_by = "NONE"
        dataset.quantile_normalize = False  # We don't QN because we're creating the target now
        dataset.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dataset
        pjda.save()

        final_context = qn_reference.create_qn_reference(job.pk)
        self.assertTrue(final_context["success"])
        self.assertTrue(os.path.exists(final_context["target_file"]))
        self.assertEqual(os.path.getsize(final_context["target_file"]), 562)

        homo_sapiens.refresh_from_db()
        target = homo_sapiens.qn_target.computedfile_set.latest()
        self.assertEqual(target.sha1, "de69d348f8b239479e2330d596c4013a7b0b2b6a")

        # Create and run a smasher job that will use the QN target we just made.
        pj = ProcessorJob()
        pj.pipeline_applied = "SMASHER"
        pj.save()

        ds = Dataset()
        ds.data = {"12345": ["1", "2", "3", "4", "5"]}
        ds.aggregate_by = "SPECIES"
        ds.scale_by = "STANDARD"
        ds.email_address = "*****@*****.**"
        ds.quantile_normalize = True
        ds.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = pj
        pjda.dataset = ds
        pjda.save()

        final_context = smasher.smash(pj.pk, upload=False)
        self.assertTrue(final_context["success"])

        np.testing.assert_almost_equal(final_context["merged_qn"]["1"][0], -0.4379488527774811)
        np.testing.assert_almost_equal(final_context["original_merged"]["1"][0], -0.5762109)
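
Note: the QN target file built by this job is, conceptually, the mean expression value at each rank across the reference samples. A minimal numpy sketch of the idea; the function names are illustrative, not refinebio's API:

import numpy as np

def build_qn_target(matrix):
    # Sort each sample (column) independently, then average across
    # samples at each rank to obtain the reference distribution.
    return np.sort(matrix, axis=0).mean(axis=1)

def quantile_normalize_column(column, target):
    # Replace each value with the target value at the same rank
    # (ties ignored for simplicity).
    return target[column.argsort().argsort()]

matrix = np.array([[5.0, 4.0, 3.0],
                   [2.0, 1.0, 4.0],
                   [3.0, 2.0, 6.0],
                   [4.0, 3.0, 8.0]])  # genes x samples
target = build_qn_target(matrix)
normalized = np.column_stack(
    [quantile_normalize_column(matrix[:, j], target)
     for j in range(matrix.shape[1])])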
Example #25
    def test_log2(self):
        pj = ProcessorJob()
        pj.pipeline_applied = "SMASHER"
        pj.save()

        # Has non-log2 data:
        # https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE44421
        # ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE44nnn/GSE44421/miniml/GSE44421_family.xml.tgz
        experiment = Experiment()
        experiment.accession_code = "GSE44421"
        experiment.save()

        result = ComputationalResult()
        result.save()

        homo_sapiens = Organism.get_object_for_name("HOMO_SAPIENS")

        sample = Sample()
        sample.accession_code = 'GSM1084806'
        sample.title = 'GSM1084806'
        sample.organism = homo_sapiens
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "GSM1084806-tbl-1.txt"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        sample = Sample()
        sample.accession_code = 'GSM1084807'
        sample.title = 'GSM1084807'
        sample.organism = homo_sapiens
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        computed_file = ComputedFile()
        computed_file.filename = "GSM1084807-tbl-1.txt"
        computed_file.absolute_file_path = "/home/user/data_store/PCL/" + computed_file.filename
        computed_file.result = result
        computed_file.size_in_bytes = 123
        computed_file.is_smashable = True
        computed_file.save()

        assoc = SampleComputedFileAssociation()
        assoc.sample = sample
        assoc.computed_file = computed_file
        assoc.save()

        ds = Dataset()
        ds.data = {'GSE44421': ['GSM1084806', 'GSM1084807']}
        ds.aggregate_by = 'EXPERIMENT'
        ds.scale_by = 'MINMAX'
        ds.email_address = "*****@*****.**"
        ds.quantile_normalize = False
        ds.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = pj
        pjda.dataset = ds
        pjda.save()

        final_context = smasher.smash(pj.pk, upload=False)
        ds = Dataset.objects.get(id=ds.id)

        self.assertTrue(final_context['success'])
Example #26
    def test_drop_samples(self):
        """Make sure that we drop samples with >50% missing values"""
        job = ProcessorJob()
        job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
        job.save()

        danio_rerio = Organism(name="DANIO_RERIO", taxonomy_id=1)
        danio_rerio.save()

        experiment = Experiment()
        experiment.accession_code = "GSE1234"
        experiment.save()

        samples = list(str(i) for i in range(0, 10))
        for i in samples:
            create_sample_for_experiment(
                {
                    "organism": danio_rerio,
                    "accession_code": i,
                    "technology": "MICROARRAY"
                },
                experiment,
            )

        dset = Dataset()
        dset.data = {"GSE1234": "ALL"}
        dset.scale_by = "NONE"
        dset.aggregate_by = "SPECIES"
        dset.svd_algorithm = "ARPACK"
        dset.quantile_normalize = True
        dset.save()

        df = pd.DataFrame(columns=samples)
        for i in range(1, 101):
            row_i = {idx: i for idx in samples}

            if i % 3 != 0 and i % 3 != 1:
                del row_i["0"]

            if i % 2 != 0:
                del row_i["1"]

            if i % 3 != 0:
                del row_i["2"]

            if i % 4 != 0:
                del row_i["3"]

            df.loc[str(i)] = row_i

        job_context = {
            "microarray_matrix": df,
            "job": job,
            "dataset": dset,
            # This key is added in the setup code, so we need to add it ourselves here
            "filtered_samples": {},
        }

        job_context = create_compendia._full_outer_join_gene_matrices(
            job_context)
        final_job_context = create_compendia._filter_rows_and_columns(
            job_context)

        filtered_matrix = final_job_context["row_col_filtered_matrix"]

        # Columns 0 and 1 have missing data, but they should still have >= 50%.
        # Columns 2 and 3 are both missing >50% though, so they should be filtered.
        self.assertEqual(set(filtered_matrix.columns),
                         {"0", "1"} | {str(i)
                                       for i in range(4, 10)})
        self.assertEqual(set(final_job_context["filtered_samples"].keys()),
                         {"2", "3"})
        for v in final_job_context["filtered_samples"].values():
            self.assertIn("less than 50% present", v["reason"])
Example #27
    def test_create_compendia_danio(self):
        job = ProcessorJob()
        job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
        job.save()

        # MICROARRAY TECH
        experiment = Experiment()
        experiment.accession_code = "GSE1234"
        experiment.save()

        result = ComputationalResult()
        result.save()

        qn_target = ComputedFile()
        qn_target.filename = "danio_target.tsv"
        qn_target.absolute_file_path = "/home/user/data_store/QN/danio_target.tsv"
        qn_target.is_qn_target = True
        qn_target.size_in_bytes = "12345"
        qn_target.sha1 = "aabbccddeeff"
        qn_target.result = result
        qn_target.save()

        danio_rerio = Organism(name="DANIO_RERIO",
                               taxonomy_id=1,
                               qn_target=result)
        danio_rerio.save()

        cra = ComputationalResultAnnotation()
        cra.data = {}
        cra.data["organism_id"] = danio_rerio.id
        cra.data["is_qn"] = True
        cra.result = result
        cra.save()

        result = ComputationalResult()
        result.save()

        micros = []
        for file in os.listdir("/home/user/data_store/raw/TEST/MICROARRAY/"):

            if "microarray.txt" in file:
                continue

            sample = Sample()
            sample.accession_code = file
            sample.title = file
            sample.organism = danio_rerio
            sample.technology = "MICROARRAY"
            sample.save()

            sra = SampleResultAssociation()
            sra.sample = sample
            sra.result = result
            sra.save()

            esa = ExperimentSampleAssociation()
            esa.experiment = experiment
            esa.sample = sample
            esa.save()

            computed_file = ComputedFile()
            computed_file.filename = file
            computed_file.absolute_file_path = "/home/user/data_store/raw/TEST/MICROARRAY/" + file
            computed_file.result = result
            computed_file.size_in_bytes = 123
            computed_file.is_smashable = True
            computed_file.save()

            assoc = SampleComputedFileAssociation()
            assoc.sample = sample
            assoc.computed_file = computed_file
            assoc.save()

            micros.append(file)

        experiment = Experiment()
        experiment.accession_code = "GSE5678"
        experiment.save()

        result = ComputationalResult()
        result.save()
        rnas = []
        for file in os.listdir("/home/user/data_store/raw/TEST/RNASEQ/"):

            if "rnaseq.txt" in file:
                continue

            sample = Sample()
            sample.accession_code = file
            sample.title = file
            sample.organism = danio_rerio
            sample.technology = "RNASEQ"
            sample.save()

            sra = SampleResultAssociation()
            sra.sample = sample
            sra.result = result
            sra.save()

            esa = ExperimentSampleAssociation()
            esa.experiment = experiment
            esa.sample = sample
            esa.save()

            computed_file = ComputedFile()
            computed_file.filename = file
            computed_file.absolute_file_path = "/home/user/data_store/raw/TEST/RNASEQ/" + file
            computed_file.result = result
            computed_file.size_in_bytes = 123
            computed_file.is_smashable = True
            computed_file.save()

            assoc = SampleComputedFileAssociation()
            assoc.sample = sample
            assoc.computed_file = computed_file
            assoc.save()

            rnas.append(file)

        # Missing sample that will be filtered
        sample = Sample()
        sample.accession_code = "GSM1487222"
        sample.title = "this sample will be filtered"
        sample.organism = danio_rerio
        sample.technology = "RNASEQ"
        sample.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        esa = ExperimentSampleAssociation()
        esa.experiment = experiment
        esa.sample = sample
        esa.save()

        rnas.append(sample.accession_code)

        dset = Dataset()
        dset.data = {"GSE1234": micros, "GSE5678": rnas}
        dset.scale_by = "NONE"
        dset.aggregate_by = "SPECIES"
        dset.svd_algorithm = "ARPACK"
        dset.quantile_normalize = False
        dset.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dset
        pjda.save()

        final_context = create_compendia.create_compendia(job.id)

        # Verify result
        self.assertEqual(
            final_context["compendium_result"].result.computedfile_set.count(),
            1)
        for file in final_context[
                "compendium_result"].result.computedfile_set.all():
            self.assertTrue(os.path.exists(file.absolute_file_path))

        # test compendium_result
        self.assertEqual(final_context["compendium_result"].svd_algorithm,
                         "ARPACK")
        self.assertEqual(
            final_context["compendium_result"].primary_organism.name,
            final_context["organism_name"])
        self.assertEqual(
            final_context["compendium_result"].primary_organism.name,
            "DANIO_RERIO")
        self.assertEqual(final_context["compendium_result"].organisms.count(),
                         1)

        # check that sample with no computed file was skipped
        self.assertTrue("GSM1487222" in final_context["filtered_samples"])
        self.assertEqual(
            final_context["filtered_samples"]["GSM1487222"]
            ["experiment_accession_code"], "GSE5678")
Example #28
    def test_imputation(self):
        job = ProcessorJob()
        job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
        job.save()

        # MICROARRAY TECH
        experiment = Experiment()
        experiment.accession_code = "GSE1234"
        experiment.save()

        result = ComputationalResult()
        result.save()

        qn_target = ComputedFile()
        qn_target.filename = "danio_target.tsv"
        qn_target.absolute_file_path = "/home/user/data_store/QN/danio_target.tsv"
        qn_target.is_qn_target = True
        qn_target.size_in_bytes = "12345"
        qn_target.sha1 = "aabbccddeeff"
        qn_target.result = result
        qn_target.save()

        danio_rerio = Organism(name="DANIO_RERIO",
                               taxonomy_id=1,
                               qn_target=result)
        danio_rerio.save()

        cra = ComputationalResultAnnotation()
        cra.data = {}
        cra.data["organism_id"] = danio_rerio.id
        cra.data["is_qn"] = True
        cra.result = result
        cra.save()

        result = ComputationalResult()
        result.save()

        micros = []
        for file in os.listdir("/home/user/data_store/raw/TEST/MICROARRAY/"):

            if "microarray.txt" in file:
                continue

            create_sample_for_experiment(
                {
                    "organism": danio_rerio,
                    "accession_code": file,
                    "technology": "MICROARRAY",
                    "filename": file,
                    "data_dir": "/home/user/data_store/raw/TEST/MICROARRAY/",
                },
                experiment,
            )

            micros.append(file)

        experiment = Experiment()
        experiment.accession_code = "GSE5678"
        experiment.save()

        result = ComputationalResult()
        result.save()
        rnas = []
        for file in os.listdir("/home/user/data_store/raw/TEST/RNASEQ/"):

            if "rnaseq.txt" in file:
                continue

            create_sample_for_experiment(
                {
                    "organism": danio_rerio,
                    "accession_code": file,
                    "technology": "RNA-SEQ",
                    "filename": file,
                    "data_dir": "/home/user/data_store/raw/TEST/RNASEQ/",
                },
                experiment,
            )

            rnas.append(file)

        # Missing sample that will be filtered
        sample = create_sample_for_experiment(
            {
                "organism": danio_rerio,
                "accession_code": "GSM1487222",
                "title": "this sample will be filtered",
                "technology": "RNA-SEQ",
                "filename": None,
            },
            experiment,
        )
        rnas.append(sample.accession_code)

        dset = Dataset()
        dset.data = {"GSE1234": micros, "GSE5678": rnas}
        dset.scale_by = "NONE"
        dset.aggregate_by = "SPECIES"
        dset.svd_algorithm = "ARPACK"
        dset.quantile_normalize = True
        dset.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dset
        pjda.save()

        imputation_index = create_compendia.COMPENDIA_PIPELINE.index(
            create_compendia._perform_imputation)

        pipeline = Pipeline(name=PipelineEnum.CREATE_COMPENDIA.value)
        job_context = utils.run_pipeline(
            {
                "job_id": job.id,
                "pipeline": pipeline
            },
            create_compendia.COMPENDIA_PIPELINE[:imputation_index],
        )

        # First, run the imputation step without removing anything to get a baseline
        expected_context = utils.run_pipeline(
            job_context.copy(),
            [create_compendia.COMPENDIA_PIPELINE[imputation_index]])

        # Now pick some rows to remove according to the instructions from
        # https://github.com/AlexsLemonade/refinebio/pull/2879#issuecomment-895143336

        random.seed(42)

        # Select some rows randomly and mask a little bit less than 30% of the values
        rare_rows = random.sample(list(job_context["microarray_matrix"].index),
                                  k=25)
        rare_genes = {}
        for row in rare_rows:
            cols = random.sample(
                list(job_context["microarray_matrix"].columns),
                # There are around 840 samples, and we want to pick a little bit
                # less than 30% of them
                k=int(0.28 * 840),
            )
            rare_genes[row] = cols
            for col in cols:
                job_context["microarray_matrix"].loc[row, col] = np.nan

        # Now randomly select some entries from the other rows to mask
        individual_indices = random.sample(
            list(
                itertools.product(
                    set(job_context["microarray_matrix"].index) -
                    set(rare_rows),
                    job_context["microarray_matrix"].columns,
                )),
            k=1000,
        )
        for row, col in individual_indices:
            job_context["microarray_matrix"].loc[row, col] = np.nan

        final_context = utils.run_pipeline(
            job_context,
            [create_compendia.COMPENDIA_PIPELINE[imputation_index]])
        self.assertDidNotFail(job)

        index = set(final_context["merged_no_qn"].index) & set(
            expected_context["merged_no_qn"].index)
        columns = set(final_context["merged_no_qn"].columns) & set(
            expected_context["merged_no_qn"].columns)

        # Calculate the Root-Mean-Square Error (RMSE) of the imputed values.
        # See https://en.wikipedia.org/wiki/Root-mean-square_deviation
        # for a description of the formula.

        N = 0
        squared_error = 0
        affected_entries = {
            *individual_indices,
            *((row, col) for row, cols in rare_genes.items() for col in cols),
        }
        for row, col in affected_entries:
            if row in index and col in columns:
                actual = final_context["merged_no_qn"].loc[row, col]
                expected = expected_context["merged_no_qn"].loc[row, col]

                N += 1
                squared_error += (actual - expected)**2

        rmse = math.sqrt(squared_error / N)

        # The results of a previous run plus a little bit of leeway
        self.assertLess(abs(rmse - 0.2868600293662542), 0.05)
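
Note: the loop above computes the textbook RMSE, sqrt(sum((actual - expected)^2) / N). Vectorized over the masked entries, it collapses to a one-liner; a minimal numpy sketch:

import numpy as np

def rmse(actual, expected):
    # Root-mean-square error between imputed and baseline values.
    diff = np.asarray(actual) - np.asarray(expected)
    return float(np.sqrt(np.mean(diff ** 2)))

print(rmse([1.1, 2.0, 2.9], [1.0, 2.0, 3.0]))  # ~0.0816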
Example #29
    def handle(self, *args, **options):
        """ Dispatch QN_REFERENCE creation jobs for all Organisms with a platform with enough processed samples. """

        organisms = Organism.objects.all()

        for organism in organisms:
            samples = Sample.processed_objects.filter(
                organism=organism,
                has_raw=True,
                technology="MICROARRAY",
                is_processed=True,
                platform_name__contains="Affymetrix",
            )
            if samples.count() < MIN:
                logger.info(
                    "Total proccessed samples don't meet minimum threshhold",
                    organism=organism,
                    count=samples.count(),
                    min=MIN,
                )
                continue

            platform_counts = (
                samples.values("platform_accession_code").annotate(
                    dcount=Count("platform_accession_code")).order_by(
                        "-dcount"))
            biggest_platform = platform_counts[0]["platform_accession_code"]

            sample_codes_results = Sample.processed_objects.filter(
                platform_accession_code=biggest_platform,
                has_raw=True,
                technology="MICROARRAY",
                organism=organism,
                is_processed=True,
            ).values("accession_code")

            if sample_codes_results.count() < MIN:
                logger.info(
                    "Number of processed samples for largest platform didn't mean threshold.",
                    organism=organism,
                    platform_accession_code=biggest_platform,
                    count=sample_codes_results.count(),
                    min=MIN,
                )
                continue

            sample_codes = [
                res["accession_code"] for res in sample_codes_results
            ]

            dataset = Dataset()
            dataset.data = {
                organism.name + "_(" + biggest_platform + ")": sample_codes
            }
            dataset.aggregate_by = "ALL"
            dataset.scale_by = "NONE"
            dataset.quantile_normalize = False
            dataset.save()

            job = ProcessorJob()
            job.pipeline_applied = "QN_REFERENCE"
            job.save()

            pjda = ProcessorJobDatasetAssociation()
            pjda.processor_job = job
            pjda.dataset = dataset
            pjda.save()

            logger.info("Sending QN_REFERENCE for Organism",
                        job_id=str(job.pk),
                        organism=str(organism))
            send_job(ProcessorPipeline.QN_REFERENCE, job)
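
Note: the annotate/Count/order_by query above simply picks the platform with the most processed samples. The same selection in plain Python, for reference; the platform codes here are made up:

from collections import Counter

platform_codes = ["A-AFFY-44", "A-AFFY-44", "A-MEXP-1171", "A-AFFY-44"]
biggest_platform, dcount = Counter(platform_codes).most_common(1)[0]
print(biggest_platform, dcount)  # A-AFFY-44 3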
Example #30
    def test_create_compendia_microarray_only(self):
        """
        Make sure that we can actually create a compendium with just microarray samples.
        """
        job = ProcessorJob()
        job.pipeline_applied = ProcessorPipeline.CREATE_COMPENDIA.value
        job.save()

        # MICROARRAY TECH
        experiment = Experiment()
        experiment.accession_code = "GSE1234"
        experiment.save()

        result = ComputationalResult()
        result.save()

        qn_target = ComputedFile()
        qn_target.filename = "danio_target.tsv"
        qn_target.absolute_file_path = "/home/user/data_store/QN/danio_target.tsv"
        qn_target.is_qn_target = True
        qn_target.size_in_bytes = "12345"
        qn_target.sha1 = "aabbccddeeff"
        qn_target.result = result
        qn_target.save()

        danio_rerio = Organism(name="DANIO_RERIO",
                               taxonomy_id=1,
                               qn_target=result)
        danio_rerio.save()

        cra = ComputationalResultAnnotation()
        cra.data = {}
        cra.data["organism_id"] = danio_rerio.id
        cra.data["is_qn"] = True
        cra.result = result
        cra.save()

        result = ComputationalResult()
        result.save()

        micros = []
        for file in os.listdir("/home/user/data_store/raw/TEST/MICROARRAY/"):

            if "microarray.txt" in file:
                continue

            create_sample_for_experiment(
                {
                    "organism": danio_rerio,
                    "accession_code": file,
                    "technology": "MICROARRAY",
                    "filename": file,
                    "data_dir": "/home/user/data_store/raw/TEST/MICROARRAY/",
                },
                experiment,
            )

            micros.append(file)

        dset = Dataset()
        dset.data = {"GSE1234": micros}
        dset.scale_by = "NONE"
        dset.aggregate_by = "SPECIES"
        dset.svd_algorithm = "ARPACK"
        dset.quantile_normalize = True
        dset.save()

        pjda = ProcessorJobDatasetAssociation()
        pjda.processor_job = job
        pjda.dataset = dset
        pjda.save()

        final_context = create_compendia.create_compendia(job.id)

        self.assertSucceeded(job)

        # Verify result
        self.assertEqual(
            final_context["compendium_result"].result.computedfile_set.count(),
            1)
        for file in final_context[
                "compendium_result"].result.computedfile_set.all():
            self.assertTrue(os.path.exists(file.absolute_file_path))

        # test compendium_result
        self.assertEqual(final_context["compendium_result"].svd_algorithm,
                         "ARPACK")
        self.assertEqual(
            final_context["compendium_result"].primary_organism.name,
            final_context["organism_name"],
        )
        self.assertEqual(
            final_context["compendium_result"].primary_organism.name,
            "DANIO_RERIO")
        self.assertEqual(final_context["compendium_result"].organisms.count(),
                         1)

        zf = zipfile.ZipFile(final_context["compendium_result"].result.
                             computedfile_set.first().absolute_file_path)
        with zf.open("aggregated_metadata.json") as f:
            metadata = json.load(f)

            self.assertFalse(metadata.get("quant_sf_only"))
            # 420 microarray samples
            self.assertEqual(metadata.get("num_samples"), 420)
            self.assertEqual(metadata.get("num_experiments"), 1)

            # Make sure the data were quantile normalized
            self.assertTrue(metadata.get("quantile_normalized"))

        self.assertIn("ks_statistic", final_context)
        self.assertIn("ks_pvalue", final_context)
        self.assertEqual(final_context["ks_pvalue"], 1.0)