Example #1
    def test_calls_survey(self, mock_get):
        """If source_type is supported calls the appropriate survey method."""
        mock_get.side_effect = mocked_requests_get

        # Prevent a call being made to NCBI's API to determine
        # organism name/id.
        organism = Organism(name="H**O SAPIENS", taxonomy_id=9606, is_scientific_name=True)
        organism.save()

        survey_job = SurveyJob(source_type="ARRAY_EXPRESS")
        survey_job.save()
        key_value_pair = SurveyJobKeyValue(survey_job=survey_job,
                                           key="experiment_accession_code",
                                           value="E-GEOD-22166")
        key_value_pair.save()

        surveyor.run_job(survey_job)
        logger.info("Started Survey Job %d, waiting for it to complete.", survey_job.id)
        survey_job = wait_for_job(survey_job, SurveyJob)
        self.assertTrue(survey_job.success)

        batch = Batch.objects.filter(survey_job=survey_job).get()

        downloader_job = batch.downloaderjob_set.get()
        logger.info("Survey Job finished, waiting for Downloader Job %d to complete.",
                    downloader_job.id)
        downloader_job = wait_for_job(downloader_job, DownloaderJob)
        self.assertTrue(downloader_job.success)

        processor_job = batch.processorjob_set.get()
        logger.info("Downloader Job finished, waiting for processor Job %d to complete.",
                    processor_job.id)
        processor_job = wait_for_job(processor_job, ProcessorJob)
        self.assertTrue(processor_job.success)
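This test (and Example #24 below) calls a wait_for_job helper that is not included in this listing. A minimal polling sketch of what such a helper could look like, assuming the job models expose end_time; the timeout, poll interval, and optional start_time argument are illustrative, not the project's actual implementation:

import time

def wait_for_job(job, job_class, start_time=None, timeout_seconds=1200, poll_interval=20):
    """Poll the database until the given job records an end_time, then return it."""
    waited = 0
    while waited < timeout_seconds:
        # Re-fetch the row so we pick up changes made by the worker process.
        job = job_class.objects.get(pk=job.pk)
        if job.end_time is not None:
            return job
        time.sleep(poll_interval)
        waited += poll_interval
    raise AssertionError("Job {} did not finish within {} seconds.".format(job.pk, timeout_seconds))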
Example #2
    def test_illumina_no_pvalue(self):
        """This experiment should fail because it has no p-value columns, so
        make sure it fails at that stage of the processing"""
        organism = Organism(name="HOMO_SAPIENS",
                            taxonomy_id=9606,
                            is_scientific_name=True)
        organism.save()

        pj = prepare_illumina_job({
            "source_filename":
            "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE41nnn/GSE41355/suppl/GSE41355%5Fnon%2Dnormalized%2Etxt%2Egz",
            "filename":
            "GSE41355_non-normalized.txt",
            "absolute_file_path":
            "/home/user/data_store/raw/TEST/ILLUMINA/GSE41355_non-normalized.txt",
            "organism":
            organism,
            "samples": [
                ("GSM1015436", "IRF3/7 DKO 2"),
            ],
        })

        final_context = illumina.illumina_to_pcl(pj.pk, cleanup=False)

        self.assertFailed(pj, "Could not detect PValue column!")
Example #3
    def test_illumina_rows_starting_with_whitespace(self):
        organism = Organism(name="HOMO_SAPIENS",
                            taxonomy_id=9606,
                            is_scientific_name=True)
        organism.save()

        pj = prepare_illumina_job({
            "source_filename":
            "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE112nnn/GSE112517/suppl/GSE112517_non-normalized.txt.gz",
            "filename":
            "GSE112517_non-normalized.txt",
            "absolute_file_path":
            "/home/user/data_store/raw/TEST/ILLUMINA/GSE112517_non-normalized.txt",
            "organism":
            organism,
            "samples": [
                (
                    "GSM3071991",
                    "MCF-7 KLHDC7B siRNA knockdown control",
                    {
                        "description": ["SAMPLE 1"],
                    },
                ),
                (
                    "GSM3071992",
                    "MCF-7 KLHDC7B siRNA knockdown",
                    {
                        "description": ["SAMPLE 2"],
                    },
                ),
            ],
        })

        final_context = illumina.illumina_to_pcl(pj.pk, cleanup=False)
        self.assertSucceeded(pj)
Example #4
    def test_processor_and_organism_in_sample(self):
        sample = Sample.objects.create(accession_code="ACCESSION", title="fake sample")
        homo_sapiens = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
        homo_sapiens.save()
        transcriptome_result = ComputationalResult.objects.create()
        organism_index = OrganismIndex.objects.create(
            organism=homo_sapiens, result=transcriptome_result, index_type="TRANSCRIPTOME_LONG"
        )
        result = ComputationalResult.objects.create(
            processor=self.salmon_quant_proc, organism_index=organism_index
        )
        SampleResultAssociation.objects.create(sample=sample, result=result)

        response = self.client.get(
            reverse(
                "samples_detail",
                kwargs={"accession_code": sample.accession_code, "version": API_VERSION},
            )
        )
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        processor = response.json()["results"][0]["processor"]
        self.assertEqual(processor["name"], self.salmon_quant_proc.name)
        self.assertEqual(
            processor["environment"]["os_pkg"]["python3"],
            self.salmon_quant_proc.environment["os_pkg"]["python3"],
        )

        organism_index = response.json()["results"][0]["organism_index"]
        self.assertEqual(organism_index["result_id"], transcriptome_result.id)
        self.assertEqual(organism_index["index_type"], "TRANSCRIPTOME_LONG")
Example #5
    def setUp(self):
        # Insert the human organism into the database so the model doesn't call
        # the taxonomy API to populate it.
        organism = Organism(name="HOMO_SAPIENS",
                            taxonomy_id=9606,
                            is_scientific_name=True)
        organism.save()
Example #6
    def test_illumina_id_ref_column_with_whitespace(self):
        """This test case tests the issue brought up in
        https://github.com/alexslemonade/refinebio/issues/1560
        where an ID_REF column would not be detected because the column name had a trailing space
        """

        organism = Organism(name="HOMO_SAPIENS",
                            taxonomy_id=9606,
                            is_scientific_name=True)
        organism.save()

        pj = prepare_illumina_job({
            "source_filename":
            "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE100nnn/GSE100301/suppl/GSE100301%5Fnon%2Dnormalized%2Etxt%2Egz",
            "filename":
            "GSE100301_non-normalized.txt",
            "absolute_file_path":
            "/home/user/data_store/raw/TEST/ILLUMINA/GSE100301_non-normalized.txt",
            "organism":
            organism,
            "samples": [
                (
                    "GSM2677583",
                    "22Rv1-tetO-Gal4, replicate 1",
                    {
                        "description": ["SAMPLE 1"],
                    },
                ),
            ],
        })

        final_context = illumina.illumina_to_pcl(pj.pk, cleanup=False)
        self.assertSucceeded(pj)
Example #7
    def test_illumina_to_pcl(self):
        """Most basic Illumina to PCL test"""

        organism = Organism(name="HOMO_SAPIENS",
                            taxonomy_id=9606,
                            is_scientific_name=True)
        organism.save()

        job = prepare_illumina_job({**GSE22427, "organism": organism})

        # Remove the title of one of the samples to make sure that we can still
        # find its detection column using the description given as an annotation
        sample = Sample.objects.get(title="LV-T350A&si-EZH2-3")
        sample.title = "ignoreme_for_description"
        sample.accession_code = "ignoreme_for_description"
        sample.save()

        final_context = illumina.illumina_to_pcl(job.pk, cleanup=False)
        self.assertSucceeded(job)

        for sample in final_context["samples"]:
            smashme = sample.get_most_recent_smashable_result_file()
            self.assertTrue(os.path.exists(smashme.absolute_file_path))
            os.remove(smashme.absolute_file_path)

        # Clean up after the job since it won't clean up after itself
        # because we aren't running in the cloud.
        shutil.rmtree(final_context["work_dir"], ignore_errors=True)
Example #8
    def test_good_detection(self):
        """GSE54661 appears to be mislabled (illuminaHumanv4) on GEO. Shows our detector works. """
        from data_refinery_workers.processors import illumina

        pj = ProcessorJob()
        pj.pipeline_applied = "ILLUMINA_TO_PCL"
        pj.save()

        og_file = OriginalFile()
        og_file.source_filename = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE54nnn/GSE54661/suppl/GSE54661%5Fnon%5Fnormalized%2Etxt%2Egz"
        og_file.filename = "GSE54661_non_normalized.txt"
        og_file.absolute_file_path = (
            "/home/user/data_store/raw/TEST/ILLUMINA/GSE54661_non_normalized.txt"
        )
        og_file.is_downloaded = True
        og_file.save()

        assoc1 = ProcessorJobOriginalFileAssociation()
        assoc1.original_file = og_file
        assoc1.processor_job = pj
        assoc1.save()

        organism = Organism(name="HOMO_SAPIENS",
                            taxonomy_id=9606,
                            is_scientific_name=True)
        organism.save()

        sample = Sample()
        sample.accession_code = "ABCD-1234"
        sample.title = "hypoxia_Signal"
        sample.organism = organism
        sample.save()

        sample_assoc = OriginalFileSampleAssociation()
        sample_assoc.original_file = og_file
        sample_assoc.sample = sample
        sample_assoc.save()

        final_context = illumina.illumina_to_pcl(pj.pk)
        self.assertEqual(final_context["platform"], "illuminaHumanv3")

        for key in final_context["samples"][0].sampleannotation_set.all(
        )[0].data.keys():
            self.assertTrue(key in [
                "detected_platform", "detection_percentage",
                "mapped_percentage"
            ])

        # Clean up after the job since it won't clean up after itself
        # because we aren't running in the cloud.
        shutil.rmtree(final_context["work_dir"], ignore_errors=True)
Example #9
    def test_bad_illumina_detection(self):
        """With the wrong species, this will fail the platform detection threshold."""

        organism = Organism(name="RATTUS_NORVEGICUS",
                            taxonomy_id=9606,
                            is_scientific_name=True)
        organism.save()

        job = prepare_illumina_job({**GSE22427, "organism": organism})
        final_context = illumina.illumina_to_pcl(job.pk, cleanup=False)
        self.assertTrue(final_context["abort"])

        # Clean up after the job since it won't clean up after itself
        # because we aren't running in the cloud.
        shutil.rmtree(final_context["work_dir"], ignore_errors=True)
Example #10
    def test_convert_processed_illumina(self):
        job = ProcessorJob()
        job.pipeline_applied = "NO_OP"
        job.save()

        # ex:
        # Reporter Identifier VALUE   Detection Pval
        # ILMN_1343291    14.943602   0
        # ILMN_1343295    13.528082   0
        og_file = OriginalFile()
        og_file.source_filename = "https://www.ebi.ac.uk/arrayexpress/experiments/E-GEOD-22433/"
        og_file.filename = "GSM557500_sample_table.txt"
        og_file.absolute_file_path = (
            "/home/user/data_store/raw/TEST/NO_OP/GSM557500_sample_table.txt")
        og_file.is_downloaded = True
        og_file.save()

        homo_sapiens = Organism(name="HOMO_SAPIENS",
                                taxonomy_id=9606,
                                is_scientific_name=True)
        homo_sapiens.save()

        sample = Sample()
        sample.accession_code = "GSM557500"
        sample.title = "GSM557500"
        sample.platform_accession_code = "A-MEXP-1171"
        sample.manufacturer = "ILLUMINA"
        sample.organism = homo_sapiens
        sample.save()

        assoc = OriginalFileSampleAssociation()
        assoc.original_file = og_file
        assoc.sample = sample
        assoc.save()

        assoc1 = ProcessorJobOriginalFileAssociation()
        assoc1.original_file = og_file
        assoc1.processor_job = job
        assoc1.save()

        # To:
        # ENSG00000156508 14.943602
        # ENSG00000111640 13.528082
        final_context = no_op.no_op_processor(job.pk)
        self.assertTrue(final_context["success"])
        self.assertTrue(os.path.exists(final_context["output_file_path"]))
        self.assertEqual(os.path.getsize(final_context["output_file_path"]),
                         920374)
        self.assertTrue(
            no_op.check_output_quality(final_context["output_file_path"]))
Example #11
    def test_convert_illumina_no_header(self):
        job = ProcessorJob()
        job.pipeline_applied = "NO_OP"
        job.save()

        # ex:
        # ILMN_1885639    10.0000 0.7931
        # ILMN_2209417    10.0000 0.2029
        # ILMN_1765401    152.0873    0.0000
        og_file = OriginalFile()
        og_file.source_filename = (
            "https://github.com/AlexsLemonade/refinebio/files/2255178/GSM1089291-tbl-1.txt"
        )
        og_file.filename = "GSM1089291-tbl-1.txt"
        og_file.absolute_file_path = "/home/user/data_store/raw/TEST/NO_OP/GSM1089291-tbl-1.txt"
        og_file.is_downloaded = True
        og_file.save()

        homo_sapiens = Organism(name="HOMO_SAPIENS",
                                taxonomy_id=9606,
                                is_scientific_name=True)
        homo_sapiens.save()

        sample = Sample()
        sample.accession_code = "GSM557500"
        sample.title = "GSM557500"
        sample.platform_accession_code = "A-MEXP-1171"
        sample.manufacturer = "ILLUMINA"
        sample.organism = homo_sapiens
        sample.save()

        assoc = OriginalFileSampleAssociation()
        assoc.original_file = og_file
        assoc.sample = sample
        assoc.save()

        assoc1 = ProcessorJobOriginalFileAssociation()
        assoc1.original_file = og_file
        assoc1.processor_job = job
        assoc1.save()

        # To:
        # ENSG00000105675 10
        # ENSG00000085721 152.0873
        # ENSG00000278494 152.0873
        final_context = no_op.no_op_processor(job.pk)
        self.assertTrue(final_context["success"])
        self.assertTrue(os.path.exists(final_context["output_file_path"]))
        self.assertEqual(os.path.getsize(final_context["output_file_path"]),
                         786207)
Example #12
    def test_convert_illumina_bad_cols(self):
        """
        In the future, this test may be deprecated. For now it just alerts us that it needs attention.
        """
        job = ProcessorJob()
        job.pipeline_applied = "NO_OP"
        job.save()

        # ex:
        # ILMN_1885639    10.0000 0.7931  11.0000 0.123
        # ILMN_2209417    10.0000 0.2029  11.1234 0.543
        # LMN_1765401    152.0873    0.0000  99.999  0.19
        og_file = OriginalFile()
        og_file.source_filename = (
            "https://github.com/AlexsLemonade/refinebio/files/2255178/GSM1089291-tbl-1-modified.txt"
        )
        og_file.filename = "GSM1089291-tbl-1-modified.txt"
        og_file.absolute_file_path = (
            "/home/user/data_store/raw/TEST/NO_OP/GSM1089291-tbl-1-modified.txt"
        )
        og_file.is_downloaded = True
        og_file.save()

        homo_sapiens = Organism(name="HOMO_SAPIENS",
                                taxonomy_id=9606,
                                is_scientific_name=True)
        homo_sapiens.save()

        sample = Sample()
        sample.accession_code = "GSM557500"
        sample.title = "GSM557500"
        sample.platform_accession_code = "A-MEXP-1171"
        sample.manufacturer = "ILLUMINA"
        sample.organism = homo_sapiens
        sample.save()

        assoc = OriginalFileSampleAssociation()
        assoc.original_file = og_file
        assoc.sample = sample
        assoc.save()

        assoc1 = ProcessorJobOriginalFileAssociation()
        assoc1.original_file = og_file
        assoc1.processor_job = job
        assoc1.save()

        final_context = no_op.no_op_processor(job.pk)
        self.assertFalse(final_context["success"])
        self.assertTrue("Tell Rich!" in final_context["job"].failure_reason)
Example #13
    def setUp(self):
        survey_job = SurveyJob(source_type="SRA")
        survey_job.save()
        self.survey_job = survey_job

        key_value_pair = SurveyJobKeyValue(survey_job=survey_job,
                                           key="experiment_accession_code",
                                           value="DRR002116")
        key_value_pair.save()

        # Insert the organism into the database so the model doesn't call the
        # taxonomy API to populate it.
        organism = Organism(name="HOMO_SAPIENS",
                            taxonomy_id=9606,
                            is_scientific_name=True)
        organism.save()
Example #14
def get_organism_with_qn_target():
    result = ComputationalResult()
    result.save()

    qn_target = ComputedFile()
    qn_target.filename = "danio_target.tsv"
    qn_target.absolute_file_path = "/home/user/data_store/QN/danio_target.tsv"
    qn_target.is_qn_target = True
    qn_target.size_in_bytes = "12345"
    qn_target.sha1 = "aabbccddeeff"
    qn_target.result = result
    qn_target.save()

    danio_rerio = Organism(name="DANIO_RERIO", taxonomy_id=1, qn_target=result)
    danio_rerio.save()
    return danio_rerio
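A hypothetical usage of this fixture helper in a test body; the test name and assertions below are illustrative, using only fields the helper itself sets:

def test_qn_target_fixture(self):
    danio_rerio = get_organism_with_qn_target()
    self.assertEqual(danio_rerio.name, "DANIO_RERIO")
    self.assertIsNotNone(danio_rerio.qn_target)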
Example #15
    def test_unmated_reads(self):
        """Survey, download, then process a sample we know is SRA and has unmated reads.

        This test uses VCR to remove the dependence upon NCBI's
        servers, but the downloader job hits ENA's FTP and aspera
        servers. Unfortunately there's not much that can be done to
        avoid that behavior from here because the downloader jobs
        always check ENA's FTP server to see if the file has an
        unmated read. For now we'll just have to be content with the
        fact that NCBI going down won't affect this test.
        """
        # Clear out pre-existing work dirs so there are no conflicts:
        self.env = EnvironmentVarGuard()
        self.env.set("RUNING_IN_CLOUD", "False")
        with self.env:
            for work_dir in glob.glob(LOCAL_ROOT_DIR + "/processor_job_*"):
                shutil.rmtree(work_dir)

            # prevent a call being made to NCBI's API to determine
            # organism name/id.
            organism = Organism(name="HOMO_SAPIENS",
                                taxonomy_id=9606,
                                is_scientific_name=True)
            organism.save()

            # Survey just a single run to make things faster!
            # This sample has unmated reads!
            survey_job = surveyor.survey_experiment("SRR1603661", "SRA")

            self.assertTrue(survey_job.success)

            # Let's give the downloader a little bit to get started
            # and to update the OriginalFiles' source_urls.
            time.sleep(60)

            downloader_jobs = DownloaderJob.objects.all()
            self.assertEqual(downloader_jobs.count(), 1)
            downloader_job = downloader_jobs.first()

            self.assertIsNotNone(downloader_job.start_time)

            for original_file in downloader_job.original_files.all():
                self.assertTrue(".fastq.gz" in original_file.source_url)

            # The downloader job will take a while to complete. Let's not wait.
            print(downloader_job.kill_nomad_job())
Example #16
    def test_survey(self, mock_get, mock_urlopen, mock_send_job):
        json_file_path = os.path.join(os.path.dirname(__file__),
                                      "test_transcriptome_species.json")
        with open(json_file_path, "r") as json_file:
            species_json = json.load(json_file)

        # Insert the organisms into the database so the model doesn't call the
        # taxonomy API to populate them.
        for species in species_json:
            # Account for the subtle difference between the API for
            # the main Ensembl division and the API for the rest of
            # them.
            name_key = "common_name" if "common_name" in species else "name"
            taxonomy_key = "taxonomy_id" if "taxonomy_id" in species else "taxon_id"
            organism = Organism(name=species[name_key].upper(),
                                taxonomy_id=species[taxonomy_key],
                                is_scientific_name=True)
            organism.save()

        mock_get.return_value = Mock(ok=True)
        mock_get.return_value.json.return_value = species_json

        # There are two possible file locations. The correct one is
        # determined by making a request to one to see if it
        # exists. This URLError simulates it not existing.
        mock_urlopen.side_effect = URLError("404 or something")

        surveyor = TranscriptomeIndexSurveyor(self.survey_job)
        surveyor.survey()

        downloader_jobs = DownloaderJob.objects.order_by("id").all()
        self.assertEqual(downloader_jobs.count(), len(species_json))
        send_job_calls = []
        for downloader_job in downloader_jobs:
            send_job_calls.append(
                call(Downloaders.TRANSCRIPTOME_INDEX, downloader_job.id))

        mock_send_job.assert_has_calls(send_job_calls)

        # There should be 2 Batches for each species (long and short
        # transcriptome lengths).
        batches = Batch.objects.all()
        self.assertEqual(batches.count(), len(species_json) * 2)
        # And each batch has two files: fasta and gtf
        for batch in batches:
            self.assertEqual(len(batch.files), 2)
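The contents of test_transcriptome_species.json are not shown in this listing. Based on how the loop above picks its name and taxonomy keys, each entry presumably resembles one of the two shapes below; the species names are illustrative only:

species_json = [
    # Shape 1: entries that carry a common name and "taxonomy_id".
    {"common_name": "Zebrafish", "name": "danio_rerio", "taxonomy_id": 7955},
    # Shape 2: entries that only provide "name" and "taxon_id".
    {"name": "caenorhabditis_elegans", "taxon_id": 6239},
]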
Example #17
class APITestCases(APITestCase):
    def setUp(self):
        self.homo_sapiens = Organism(name="HOMO_SAPIENS",
                                     taxonomy_id=9606,
                                     is_scientific_name=True)
        self.homo_sapiens.save()
        self.danio_rerio = Organism(name="DANIO_RERIO",
                                    taxonomy_id=1337,
                                    is_scientific_name=True)
        self.danio_rerio.save()

    def tearDown(self):
        Organism.objects.all().delete()

    def test_qn_endpoints(self):
        # create two qn endpoints

        result = ComputationalResult()
        result.commands.append("create_qn_target.py")
        result.is_ccdl = True
        result.is_public = True
        result.processor = None
        result.save()

        cra = ComputationalResultAnnotation()
        cra.result = result
        cra.data = {
            "organism_id": self.danio_rerio.id,  # Danio
            "is_qn": True,
            "platform_accession_code": "zebrafish",
            "samples": [],
            "geneset": str(["RWWJ000001", "RWWJ000002"]),
        }
        cra.save()
        cra = ComputationalResultAnnotation()
        cra.result = result
        cra.data = {
            "organism_id": self.homo_sapiens.id,  # IDK
            "is_qn": True,
            "platform_accession_code": "zebrafishplusone",
            "samples": [],
            "geneset": str(["RWWJ000003", "RWWJ000004"]),
        }
        cra.save()

        self.homo_sapiens.qn_target = result
        self.homo_sapiens.save()
        self.danio_rerio.qn_target = result
        self.danio_rerio.save()

        response = self.client.get(
            reverse("qn_targets_available", kwargs={"version": API_VERSION}))

        self.assertEqual(len(response.json()), 2)
Example #18
    def test_illumina_to_pcl(self):
        """ Most basic Illumina to PCL test """
        from data_refinery_workers.processors import illumina

        organism = Organism(name="HOMO_SAPIENS",
                            taxonomy_id=9606,
                            is_scientific_name=True)
        organism.save()

        job = prepare_illumina_job(organism)
        final_context = illumina.illumina_to_pcl(job.pk)

        for sample in final_context["samples"]:
            smashme = sample.get_most_recent_smashable_result_file()
            self.assertTrue(os.path.exists(smashme.absolute_file_path))
            os.remove(smashme.absolute_file_path)

        # Clean up after the job since it won't clean up after itself
        # because we aren't running in the cloud.
        shutil.rmtree(final_context["work_dir"], ignore_errors=True)
Example #19
def prepare_job(job_info: dict) -> ProcessorJob:
    job = ProcessorJob()
    job.pipeline_applied = "NO_OP"
    job.save()

    og_file = OriginalFile()
    og_file.source_filename = job_info["source_filename"]
    og_file.filename = job_info["filename"]
    og_file.absolute_file_path = job_info["absolute_file_path"]
    og_file.is_downloaded = True
    og_file.save()

    sample = Sample()
    sample.accession_code = job_info["accession_code"]
    sample.title = job_info["accession_code"]
    sample.platform_accession_code = job_info["platform_accession_code"]

    manufacturer = job_info.get("manufacturer", None)
    if manufacturer is not None:
        sample.manufacturer = manufacturer

    # The illumina samples need the human organism
    if manufacturer == "ILLUMINA":
        homo_sapiens = Organism(name="HOMO_SAPIENS",
                                taxonomy_id=9606,
                                is_scientific_name=True)
        homo_sapiens.save()
        sample.organism = homo_sapiens

    sample.save()

    assoc = OriginalFileSampleAssociation()
    assoc.original_file = og_file
    assoc.sample = sample
    assoc.save()

    assoc1 = ProcessorJobOriginalFileAssociation()
    assoc1.original_file = og_file
    assoc1.processor_job = job
    assoc1.save()

    return job
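For reference, a call to this helper might look like the following; the values are copied from the processed-Illumina example above (Example #10) and the test context is assumed:

job = prepare_job({
    "source_filename": "https://www.ebi.ac.uk/arrayexpress/experiments/E-GEOD-22433/",
    "filename": "GSM557500_sample_table.txt",
    "absolute_file_path": "/home/user/data_store/raw/TEST/NO_OP/GSM557500_sample_table.txt",
    "accession_code": "GSM557500",
    "platform_accession_code": "A-MEXP-1171",
    "manufacturer": "ILLUMINA",
})
final_context = no_op.no_op_processor(job.pk)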
Example #20
    def test_detect_columns(self):
        organism = Organism(name="HOMO_SAPIENS",
                            taxonomy_id=9606,
                            is_scientific_name=True)
        organism.save()

        job = prepare_illumina_job({**GSE22427, "organism": organism})

        pipeline = Pipeline(name=PipelineEnum.ILLUMINA.value)

        final_context = utils.run_pipeline(
            {
                "job_id": job.id,
                "pipeline": pipeline
            },
            [
                utils.start_job,
                illumina._prepare_files,
                illumina._detect_encoding,
                illumina._sanitize_input_file,
                illumina._convert_sanitized_to_tsv,
                illumina._detect_columns,
            ],
        )

        self.assertNotEqual(final_context.get("success"), False)

        # For this experiment, the probe ID is the first column
        self.assertEqual(final_context.get("probeId"), GSE22427_HEADER[0])

        expected_column_ids = ",".join(
            map(
                lambda t: str(t[0]),
                filter(
                    # For this header file, the samples all have the prefix LV-
                    lambda t: t[1].startswith("LV-"),
                    # We use start=1 here because the column IDs are formatted
                    # for R code so they treat the header as a 1-indexed list
                    enumerate(GSE22427_HEADER, start=1),
                ),
            ))
        self.assertEqual(final_context.get("columnIds"), expected_column_ids)
Example #21
    def test_illumina_quoted_row_names(self):
        organism = Organism(name="HOMO_SAPIENS",
                            taxonomy_id=9606,
                            is_scientific_name=True)
        organism.save()

        pj = prepare_illumina_job({
            "source_filename":
            "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE33nnn/GSE33814/suppl/GSE33814%5Fnon%2Dnormalized%2Etxt%2Egz",
            # Some of the columns are trimmed to save space and time
            "filename":
            "GSE33814_trimmed_non-normalized.txt",
            "absolute_file_path":
            "/home/user/data_store/raw/TEST/ILLUMINA/GSE33814_trimmed_non-normalized.txt",
            "organism":
            organism,
            "samples": [
                ("GSM836222", "IMGUS_32"),
                ("GSM836223", "IMGUS_33"),
            ],
        })

        final_context = illumina.illumina_to_pcl(pj.pk, cleanup=False)
        self.assertSucceeded(pj)

        # Make sure that the row names are no longer quoted after sanitizing the file
        def assertNotQuoted(string: str):
            string = string.strip()
            self.assertNotEqual(string[0], '"')
            self.assertNotEqual(string[-1], '"')

        with open(final_context["sanitized_file_path"], "r") as f:
            reader = csv.reader(f, delimiter="\t")

            headers = next(reader)
            for header in headers:
                assertNotQuoted(header)

            # Also make sure the probe IDs aren't quoted
            first_row = next(reader)
            assertNotQuoted(first_row[0])
Example #22
    def test_illumina_space_separated(self):
        organism = Organism(name="HOMO_SAPIENS",
                            taxonomy_id=9606,
                            is_scientific_name=True)
        organism.save()

        pj = prepare_illumina_job({
            "source_filename":
            "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE48nnn/GSE48023/suppl/GSE48023%5Fnon%2Dnormalized%2Etxt%2Egz",
            # Some of the columns are trimmed to save space and time
            "filename":
            "GSE48023_trimmed_non-normalized.txt",
            "absolute_file_path":
            "/home/user/data_store/raw/TEST/ILLUMINA/GSE48023_trimmed_non-normalized.txt",
            "organism":
            organism,
            "samples": [
                ("GSM1165512", "WholeBloodRNA_IN0242_Day0"),
                ("GSM1165513", "WholeBloodRNA_IN0242_Day1"),
                ("GSM1165514", "WholeBloodRNA_IN0242_Day14"),
                ("GSM1165515", "WholeBloodRNA_IN0242_Day3"),
                ("GSM1165516", "WholeBloodRNA_IN0243_Day0"),
            ],
        })

        final_context = illumina.illumina_to_pcl(pj.pk, cleanup=False)
        self.assertSucceeded(pj)

        # Assert that the sanitized file is tab-separated (by reading it as a
        # TSV and making sure it has 11 headers) and has an extra ID_REF header
        with open(final_context["sanitized_file_path"], "r") as f:
            reader = csv.reader(f, delimiter="\t")

            headers = next(reader)

            # ID_REF + 5 observations + 5 p-values
            self.assertEqual(len(headers), 11)
            self.assertEqual(headers[0], "ID_REF")
Example #23
    def test_good_detection(self):
        """GSE54661 appears to be mislabled (illuminaHumanv4) on GEO. Shows our detector works."""

        organism = Organism(name="HOMO_SAPIENS",
                            taxonomy_id=9606,
                            is_scientific_name=True)
        organism.save()

        pj = prepare_illumina_job({
            "source_filename":
            "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE54nnn/GSE54661/suppl/GSE54661%5Fnon%5Fnormalized%2Etxt%2Egz",
            "filename":
            "GSE54661_non_normalized.txt",
            "absolute_file_path":
            "/home/user/data_store/raw/TEST/ILLUMINA/GSE54661_non_normalized.txt",
            "organism":
            organism,
            "samples": [("ABCD-1234", "CB CD34+ hypoxia"),
                        ("ABCD-1235", "CB CD34+ normoxia")],
        })

        final_context = illumina.illumina_to_pcl(pj.pk, cleanup=False)
        self.assertSucceeded(pj)
        self.assertEqual(final_context["platform"], "illuminaHumanv3")

        for key in final_context["samples"][0].sampleannotation_set.all(
        )[0].data.keys():
            self.assertTrue(key in [
                "detected_platform", "detection_percentage",
                "mapped_percentage"
            ])

        for sample in final_context["samples"]:
            smashme = sample.get_most_recent_smashable_result_file()
            self.assertTrue(os.path.exists(smashme.absolute_file_path))

        # Clean up after the job since it won't clean up after itself
        # because we aren't running in the cloud.
        shutil.rmtree(final_context["work_dir"], ignore_errors=True)
Example #24
    def test_no_op(self):
        """Survey, download, then process an experiment we know is NO_OP."""
        # Clear out pre-existing work dirs so there are no conflicts:

        self.env = EnvironmentVarGuard()
        self.env.set("RUNING_IN_CLOUD", "False")
        with self.env:
            for work_dir in glob.glob(LOCAL_ROOT_DIR + "/processor_job_*"):
                shutil.rmtree(work_dir)

            # Make sure there are no already existing jobs we might poll for unsuccessfully.
            DownloaderJobOriginalFileAssociation.objects.all().delete()
            DownloaderJob.objects.all().delete()
            ProcessorJobOriginalFileAssociation.objects.all().delete()
            ProcessorJob.objects.all().delete()

            # Prevent a call being made to NCBI's API to determine
            # organism name/id.
            organism = Organism(name="HOMO_SAPIENS",
                                taxonomy_id=9606,
                                is_scientific_name=True)
            organism.save()

            accession_code = "E-GEOD-3303"
            survey_job = surveyor.survey_experiment(accession_code,
                                                    "ARRAY_EXPRESS")

            self.assertTrue(survey_job.success)

            downloader_jobs = DownloaderJob.objects.all()
            self.assertGreater(downloader_jobs.count(), 0)

            logger.info(
                "Survey Job finished, waiting for Downloader Jobs to complete."
            )
            start_time = timezone.now()
            for downloader_job in downloader_jobs:
                downloader_job = wait_for_job(downloader_job, DownloaderJob,
                                              start_time)
                self.assertTrue(downloader_job.success)

            processor_jobs = ProcessorJob.objects.all()
            self.assertGreater(processor_jobs.count(), 0)

            logger.info(
                "Downloader Jobs finished, waiting for processor Jobs to complete."
            )
            start_time = timezone.now()
            for processor_job in processor_jobs:
                processor_job = wait_for_job(processor_job, ProcessorJob,
                                             start_time)
                self.assertTrue(processor_job.success)

            # Test that the unsurveyor deletes all objects related to the experiment
            purge_experiment(accession_code)

            self.assertEqual(Experiment.objects.all().count(), 0)
            self.assertEqual(ExperimentAnnotation.objects.all().count(), 0)
            self.assertEqual(ExperimentSampleAssociation.objects.all().count(),
                             0)
            self.assertEqual(Sample.objects.all().count(), 0)
            self.assertEqual(SampleAnnotation.objects.all().count(), 0)
            self.assertEqual(OriginalFile.objects.all().count(), 0)
            self.assertEqual(
                OriginalFileSampleAssociation.objects.all().count(), 0)
            self.assertEqual(SampleResultAssociation.objects.all().count(), 0)
            self.assertEqual(ComputationalResult.objects.all().count(), 0)
            self.assertEqual(
                ComputationalResultAnnotation.objects.all().count(), 0)
            self.assertEqual(
                SampleComputedFileAssociation.objects.all().count(), 0)
            self.assertEqual(ComputedFile.objects.all().count(), 0)
            self.assertEqual(DownloaderJob.objects.all().count(), 0)
            self.assertEqual(
                DownloaderJobOriginalFileAssociation.objects.all().count(), 0)
            self.assertEqual(ProcessorJob.objects.all().count(), 0)
            self.assertEqual(
                ProcessorJobOriginalFileAssociation.objects.all().count(), 0)
Example #25
    def setUp(self):
        # Saving this in case we add protected endpoints
        # self.superuser = User.objects.create_superuser('john', '*****@*****.**', 'johnpassword')
        # self.client.login(username='******', password='******')
        # self.user = User.objects.create(username="******")

        experiment = Experiment()
        experiment.accession_code = "GSE000"
        experiment.alternate_accession_code = "E-GEOD-000"
        experiment.title = "NONONONO"
        experiment.description = "Boooooourns. Wasabi."
        experiment.technology = "RNA-SEQ"
        experiment.save()

        experiment = Experiment()
        experiment.accession_code = "GSE123"
        experiment.title = "Hey Ho Let's Go"
        experiment.description = (
            "This is a very exciting test experiment. Faygo soda. Blah blah blah."
        )
        experiment.technology = "MICROARRAY"
        experiment.save()
        self.experiment = experiment

        experiment_annotation = ExperimentAnnotation()
        experiment_annotation.data = {"hello": "world", "123": 456}
        experiment_annotation.experiment = experiment
        experiment_annotation.save()

        # Create 26 test organisms numbered 0-25 for the pagination test; with the
        # 3 named organisms below there should be 29 organisms total.
        for i in range(26):
            Organism(name=("TEST_ORGANISM_{}".format(i)),
                     taxonomy_id=(1234 + i)).save()

        ailuropoda = Organism(name="AILUROPODA_MELANOLEUCA",
                              taxonomy_id=9646,
                              is_scientific_name=True)
        ailuropoda.save()
        self.homo_sapiens = Organism(name="HOMO_SAPIENS",
                                     taxonomy_id=9606,
                                     is_scientific_name=True)
        self.homo_sapiens.save()
        self.danio_rerio = Organism(name="DANIO_RERIO",
                                    taxonomy_id=1337,
                                    is_scientific_name=True)
        self.danio_rerio.save()

        sample = Sample()
        sample.title = "123"
        sample.accession_code = "123"
        sample.is_processed = True
        sample.organism = ailuropoda
        sample.save()

        sample = Sample()
        sample.title = "789"
        sample.accession_code = "789"
        sample.is_processed = True
        sample.organism = ailuropoda
        sample.save()
        self.sample = sample

        # add qn target for sample organism
        result = ComputationalResult()
        result.commands.append("create_qn_target.py")
        result.is_ccdl = True
        result.is_public = True
        result.processor = None
        result.save()

        cra = ComputationalResultAnnotation()
        cra.result = result
        cra.data = {"organism_id": ailuropoda.id, "is_qn": True}
        cra.save()

        ailuropoda.qn_target = result
        ailuropoda.save()

        sample_annotation = SampleAnnotation()
        sample_annotation.data = {"goodbye": "world", "789": 123}
        sample_annotation.sample = sample
        sample_annotation.save()

        original_file = OriginalFile()
        original_file.save()

        original_file_sample_association = OriginalFileSampleAssociation()
        original_file_sample_association.sample = sample
        original_file_sample_association.original_file = original_file
        original_file_sample_association.save()

        downloader_job = DownloaderJob()
        downloader_job.save()

        download_assoc = DownloaderJobOriginalFileAssociation()
        download_assoc.original_file = original_file
        download_assoc.downloader_job = downloader_job
        download_assoc.save()

        processor_job = ProcessorJob()
        processor_job.save()

        processor_assoc = ProcessorJobOriginalFileAssociation()
        processor_assoc.original_file = original_file
        processor_assoc.processor_job = processor_job
        processor_assoc.save()

        experiment_sample_association = ExperimentSampleAssociation()
        experiment_sample_association.sample = sample
        experiment_sample_association.experiment = experiment
        experiment_sample_association.save()
        experiment.num_total_samples = 1
        experiment.num_processed_samples = 1
        experiment.save()

        result = ComputationalResult()
        result.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        result = ComputationalResult()
        result.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        processor = Processor()
        processor.name = "Salmon Quant"
        processor.version = "v9.9.9"
        processor.docker_image = "dr_salmon"
        processor.environment = '{"some": "environment"}'
        processor.save()

        computational_result_short = ComputationalResult(processor=processor)
        computational_result_short.save()

        organism_index = OrganismIndex()
        organism_index.index_type = "TRANSCRIPTOME_SHORT"
        organism_index.organism = self.danio_rerio
        organism_index.result = computational_result_short
        organism_index.absolute_directory_path = (
            "/home/user/data_store/salmon_tests/TRANSCRIPTOME_INDEX/SHORT")
        organism_index.is_public = True
        organism_index.s3_url = "not_blank"
        organism_index.save()

        return
Example #26
class APITestCases(APITestCase):
    def setUp(self):
        # Saving this in case we add protected endpoints
        # self.superuser = User.objects.create_superuser('john', '*****@*****.**', 'johnpassword')
        # self.client.login(username='******', password='******')
        # self.user = User.objects.create(username="******")

        experiment = Experiment()
        experiment.accession_code = "GSE000"
        experiment.alternate_accession_code = "E-GEOD-000"
        experiment.title = "NONONONO"
        experiment.description = "Boooooourns. Wasabi."
        experiment.technology = "RNA-SEQ"
        experiment.save()

        experiment = Experiment()
        experiment.accession_code = "GSE123"
        experiment.title = "Hey Ho Let's Go"
        experiment.description = (
            "This is a very exciting test experiment. Faygo soda. Blah blah blah."
        )
        experiment.technology = "MICROARRAY"
        experiment.save()
        self.experiment = experiment

        experiment_annotation = ExperimentAnnotation()
        experiment_annotation.data = {"hello": "world", "123": 456}
        experiment_annotation.experiment = experiment
        experiment_annotation.save()

        # Create 26 test organisms numbered 0-25 for the pagination test; with the
        # 3 named organisms below there should be 29 organisms total.
        for i in range(26):
            Organism(name=("TEST_ORGANISM_{}".format(i)),
                     taxonomy_id=(1234 + i)).save()

        ailuropoda = Organism(name="AILUROPODA_MELANOLEUCA",
                              taxonomy_id=9646,
                              is_scientific_name=True)
        ailuropoda.save()
        self.homo_sapiens = Organism(name="HOMO_SAPIENS",
                                     taxonomy_id=9606,
                                     is_scientific_name=True)
        self.homo_sapiens.save()
        self.danio_rerio = Organism(name="DANIO_RERIO",
                                    taxonomy_id=1337,
                                    is_scientific_name=True)
        self.danio_rerio.save()

        sample = Sample()
        sample.title = "123"
        sample.accession_code = "123"
        sample.is_processed = True
        sample.organism = ailuropoda
        sample.save()

        sample = Sample()
        sample.title = "789"
        sample.accession_code = "789"
        sample.is_processed = True
        sample.organism = ailuropoda
        sample.save()
        self.sample = sample

        # add qn target for sample organism
        result = ComputationalResult()
        result.commands.append("create_qn_target.py")
        result.is_ccdl = True
        result.is_public = True
        result.processor = None
        result.save()

        cra = ComputationalResultAnnotation()
        cra.result = result
        cra.data = {"organism_id": ailuropoda.id, "is_qn": True}
        cra.save()

        ailuropoda.qn_target = result
        ailuropoda.save()

        sample_annotation = SampleAnnotation()
        sample_annotation.data = {"goodbye": "world", "789": 123}
        sample_annotation.sample = sample
        sample_annotation.save()

        original_file = OriginalFile()
        original_file.save()

        original_file_sample_association = OriginalFileSampleAssociation()
        original_file_sample_association.sample = sample
        original_file_sample_association.original_file = original_file
        original_file_sample_association.save()

        downloader_job = DownloaderJob()
        downloader_job.save()

        download_assoc = DownloaderJobOriginalFileAssociation()
        download_assoc.original_file = original_file
        download_assoc.downloader_job = downloader_job
        download_assoc.save()

        processor_job = ProcessorJob()
        processor_job.save()

        processor_assoc = ProcessorJobOriginalFileAssociation()
        processor_assoc.original_file = original_file
        processor_assoc.processor_job = processor_job
        processor_assoc.save()

        experiment_sample_association = ExperimentSampleAssociation()
        experiment_sample_association.sample = sample
        experiment_sample_association.experiment = experiment
        experiment_sample_association.save()
        experiment.num_total_samples = 1
        experiment.num_processed_samples = 1
        experiment.save()

        result = ComputationalResult()
        result.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        result = ComputationalResult()
        result.save()

        sra = SampleResultAssociation()
        sra.sample = sample
        sra.result = result
        sra.save()

        processor = Processor()
        processor.name = "Salmon Quant"
        processor.version = "v9.9.9"
        processor.docker_image = "dr_salmon"
        processor.environment = '{"some": "environment"}'
        processor.save()

        computational_result_short = ComputationalResult(processor=processor)
        computational_result_short.save()

        organism_index = OrganismIndex()
        organism_index.index_type = "TRANSCRIPTOME_SHORT"
        organism_index.organism = self.danio_rerio
        organism_index.result = computational_result_short
        organism_index.absolute_directory_path = (
            "/home/user/data_store/salmon_tests/TRANSCRIPTOME_INDEX/SHORT")
        organism_index.is_public = True
        organism_index.s3_url = "not_blank"
        organism_index.save()

        return

    def tearDown(self):
        """ Good bye """
        Experiment.objects.all().delete()
        ExperimentAnnotation.objects.all().delete()
        Sample.objects.all().delete()
        SampleAnnotation.objects.all().delete()

    def test_all_endpoints(self):
        response = self.client.get(
            reverse("experiments", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertEqual(response["X-Source-Revision"],
                         get_env_variable("SYSTEM_VERSION"))

        response = self.client.get(
            reverse("samples", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        response = self.client.get(
            reverse("samples", kwargs={"version": API_VERSION}),
            {"ids": str(self.sample.id) + ",1000"},
        )
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        response = self.client.get(
            reverse("samples", kwargs={"version": API_VERSION}),
            {"accession_codes": str(self.sample.accession_code) + ",1000"},
        )
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        response = self.client.get(
            reverse("organisms", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        response = self.client.get(
            reverse("organisms", kwargs={"version": API_VERSION}) +
            "HOMO_SAPIENS/")
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        response = self.client.get(
            reverse("platforms", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        response = self.client.get(
            reverse("institutions", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        response = self.client.get(
            reverse("survey_jobs", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        response = self.client.get(
            reverse("downloader_jobs", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        # Don't know the best way to deal with this, but since the other tests in different files
        # create objects which are then deleted, the new objects from these tests will have different
        # IDs. In this case, since this file is run first, the IDs are 1, but this may be a problem
        # in the future.
        response = self.client.get(
            reverse("downloader_jobs", kwargs={"version": API_VERSION}) +
            "1/"  # change back
        )
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        response = self.client.get(
            reverse("processor_jobs", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        response = self.client.get(
            reverse("processor_jobs", kwargs={"version": API_VERSION}) + "1/")
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        response = self.client.get(
            reverse("stats", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        response = self.client.get(
            reverse("results", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        response = self.client.get(
            reverse("results", kwargs={"version": API_VERSION}) + "1/")
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        response = self.client.get(
            reverse("schema_redoc", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        response = self.client.get(
            reverse("transcriptome_indices", kwargs={"version": API_VERSION}) +
            "?organism__name=DANIO_RERIO")
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        response = self.client.get(
            reverse("transcriptome_indices", kwargs={"version": API_VERSION}) +
            "?result_id=1")
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        response = self.client.get(
            reverse("search", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        response = self.client.get(
            reverse("transcriptome_indices", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_200_OK)

        response = self.client.get(
            reverse("create_dataset", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code,
                         status.HTTP_405_METHOD_NOT_ALLOWED)

    def test_experiment_multiple_accessions(self):
        response = self.client.get(
            reverse("search", kwargs={"version": API_VERSION}) +
            "?accession_code=GSE000&accession_code=GSE123",
            follow=True,
        )

        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertEqual(len(response.json()["results"]), 2)

    # Test the query the front-end uses to find the experiment with a given
    # accession or alternate accession
    def test_experiment_alternate_accession(self):
        response = self.client.get(
            reverse("search", kwargs={"version": API_VERSION}) +
            "?search=alternate_accession_code:E-GEOD-000" +
            "?search=accession_code:E-GEOD-000",
            follow=True,
        )

        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertEqual(len(response.json()["results"]), 1)
        self.assertEqual(
            response.json()["results"][0]["alternate_accession_code"],
            "E-GEOD-000")

    def test_sample_multiple_accessions(self):
        response = self.client.get(
            reverse("samples", kwargs={"version": API_VERSION}) +
            "?accession_codes=123,789",
            follow=True,
        )

        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertEqual(len(response.json()["results"]), 2)

    def test_sample_pagination(self):
        response = self.client.get(
            reverse("samples", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertEqual(len(response.json()["results"]), 2)

        response = self.client.get(
            reverse("samples", kwargs={"version": API_VERSION}), {"limit": 1})
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertEqual(len(response.json()["results"]), 1)

        response = self.client.get(
            reverse("samples", kwargs={"version": API_VERSION}), {
                "limit": 1,
                "ordering": "-title"
            })
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertEqual(response.json()["results"][0]["title"], "789")

        response = self.client.get(
            reverse("samples", kwargs={"version": API_VERSION}), {
                "limit": 1,
                "ordering": "title"
            })
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertEqual(response.json()["results"][0]["title"], "123")

    def test_organism_pagination(self):
        response = self.client.get(
            reverse("organisms", kwargs={"version": API_VERSION}))
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertEqual(len(response.json()["results"]), 25)

        # First organism on second page should be TEST_ORGANISM_25, and since 29 organisms have been created, there should be 4 on the 2nd page
        response = self.client.get(
            reverse("organisms", kwargs={"version": API_VERSION}),
            {"offset": 25})
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertEqual(len(response.json()["results"]), 4)
        self.assertEqual(response.json()["results"][0]["name"],
                         "TEST_ORGANISM_25")

    def test_fetching_experiment_samples(self):
        response = self.client.get(
            reverse("samples", kwargs={"version": API_VERSION}),
            {"experiment_accession_code": self.experiment.accession_code},
        )
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertEqual(len(response.json()["results"]), 1)
        self.assertEqual(response.json()["results"][0]["accession_code"],
                         "789")

        # Expect 404 if the experiment accession code isn't valid
        response = self.client.get(
            reverse("samples", kwargs={"version": API_VERSION}),
            {"experiment_accession_code": "wrong-accession-code"},
        )
        self.assertEqual(response.status_code, 404)

    def test_sample_detail_experiment_accessions(self):
        response = self.client.get(
            reverse("samples_detail",
                    kwargs={
                        "version": API_VERSION,
                        "accession_code": "789"
                    }))
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertEqual(response.json()["experiment_accession_codes"],
                         ["GSE123"])

    def test_fetching_organism_index(self):
        organism_index_id = OrganismIndex.objects.all().first().id
        response = self.client.get(
            reverse(
                "transcriptome_indices_read",
                kwargs={
                    "id": organism_index_id,
                    "version": API_VERSION
                },
            ))
        self.assertEqual(response.status_code, status.HTTP_200_OK)
        self.assertEqual(response.json()["index_type"], "TRANSCRIPTOME_SHORT")

        # Expect 404 if the transcriptome index id is not valid
        response = self.client.get(
            reverse("transcriptome_indices_read",
                    kwargs={
                        "id": 0,
                        "version": API_VERSION
                    }))
        self.assertEqual(response.status_code, 404)

    def test_processed_samples_only(self):
        """ Don't return unprocessed samples """
        experiment = Experiment()
        experiment.accession_code = "GSX12345"
        experiment.is_public = True
        experiment.save()

        sample = Sample()
        sample.title = "I am unprocessed"
        sample.accession_code = "GSXUnprocessed"
        sample.is_processed = False
        sample.save()

        experiment_sample_association = ExperimentSampleAssociation()
        experiment_sample_association.sample = sample
        experiment_sample_association.experiment = experiment
        experiment_sample_association.save()

        # By default the search returns experiments even if none of
        # their samples have been processed.
        response = self.client.get(
            reverse("search", kwargs={"version": API_VERSION}),
            {"search": "GSX12345"})
        self.assertEqual(response.json()["count"], 1)

        # check requesting only experiments with processed samples
        response = self.client.get(
            reverse("search", kwargs={"version": API_VERSION}),
            {
                "search": "GSX12345",
                "num_processed_samples__gt": 0
            },
        )
        self.assertEqual(response.json()["count"], 0)

        sample2 = Sample()
        sample2.title = "I am processed"
        sample2.accession_code = "GSXProcessed"
        sample2.is_processed = True
        sample2.save()

        experiment_sample2_association = ExperimentSampleAssociation()
        experiment_sample2_association.sample = sample2
        experiment_sample2_association.experiment = experiment
        experiment_sample2_association.save()

        # update cached values
        experiment.num_total_samples = 2
        experiment.num_processed_samples = 1
        experiment.save()

        response = self.client.get(
            reverse("search", kwargs={"version": API_VERSION}),
            {"search": "GSX12345"})
        self.assertEqual(response.json()["count"], 1)

        self.assertEqual(len(experiment.processed_samples), 1)

        experiment.delete()
        sample.delete()
        sample2.delete()

    def test_create_token(self):
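        """Tokens can be activated at creation time or later via a PUT."""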
        # First, try activating right away
        response = self.client.post(
            reverse("token", kwargs={"version": API_VERSION}),
            json.dumps({"is_activated": True}),
            content_type="application/json",
        )
        self.assertEqual(response.status_code, 201)
        token_id = response.json()["id"]

        # Now activate using a second request
        response = self.client.post(reverse("token",
                                            kwargs={"version": API_VERSION}),
                                    content_type="application/json")
        self.assertEqual(response.status_code, 201)
        token = response.json()
        token["is_activated"] = True
        token_id = token["id"]
        response = self.client.put(
            reverse("token_id",
                    kwargs={
                        "id": token_id,
                        "version": API_VERSION
                    }),
            json.dumps(token),
            content_type="application/json",
        )
        self.assertEqual(response.status_code, 200)

        activated_token = response.json()
        self.assertEqual(activated_token["id"], token_id)
        self.assertEqual(activated_token["is_activated"], True)
Пример #27
0
class APITestCases(APITestCase):
    def setUp(self):
        self.homo_sapiens = Organism(name="HOMO_SAPIENS",
                                     taxonomy_id=9606,
                                     is_scientific_name=True)
        self.homo_sapiens.save()
        self.danio_rerio = Organism(name="DANIO_RERIO",
                                    taxonomy_id=1337,
                                    is_scientific_name=True)
        self.danio_rerio.save()

    def test_compendia(self):
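        """Compendia are listed by the computed_files endpoint.

        The download_url field is only included when an API token is
        provided.
        """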
        result = ComputationalResult()
        result.save()

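        # Two versions of a human compendium and one zebrafish compendium,
        # all attached to the same ComputationalResult.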
        hsc1 = ComputedFile()
        hsc1.absolute_file_path = "/null/1.tsv"
        hsc1.filename = "1.tsv"
        hsc1.sha1 = "abc"
        hsc1.size_in_bytes = 1
        hsc1.is_smashable = False
        hsc1.is_qn_target = False
        hsc1.result = result
        hsc1.is_compendia = True
        hsc1.compendia_organism = self.homo_sapiens
        hsc1.compendia_version = 1
        hsc1.s3_bucket = "dr-compendia"
        hsc1.s3_key = "hsc1.tsv"
        hsc1.save()

        hsc2 = ComputedFile()
        hsc2.absolute_file_path = "/null/2.tsv"
        hsc2.filename = "2.tsv"
        hsc2.sha1 = "abc"
        hsc2.size_in_bytes = 1
        hsc2.is_smashable = False
        hsc2.is_qn_target = False
        hsc2.result = result
        hsc2.is_compendia = True
        hsc2.compendia_organism = self.homo_sapiens
        hsc2.compendia_version = 2
        hsc2.s3_bucket = "dr-compendia"
        hsc2.s3_key = "hsc2.tsv"
        hsc2.save()

        drc1 = ComputedFile()
        drc1.absolute_file_path = "/null/1.tsv"
        drc1.filename = "1.tsv"
        drc1.sha1 = "abc"
        drc1.size_in_bytes = 1
        drc1.is_smashable = False
        drc1.is_qn_target = False
        drc1.result = result
        drc1.is_compendia = True
        drc1.compendia_organism = self.danio_rerio
        drc1.compendia_version = 1
        drc1.s3_bucket = "dr-compendia"
        drc1.s3_key = "drc2.tsv"
        drc1.save()

        response = self.client.get(
            reverse("computed_files", kwargs={"version": API_VERSION}),
            {"is_compendia": True})
        response_json = response.json()["results"]
        self.assertEqual(3, len(response_json))
        # Prove that the download_url field is missing and not None.
        self.assertEqual("NotPresent",
                         response_json[0].get("download_url", "NotPresent"))

        # We don't actually want AWS to generate a temporary URL for
        # us, and it won't unless we're running in the cloud, but if
        # we provide an API Token and use the WithUrl serializer then
        # it will set the download_url field to None rather than
        # generate one.

        # Create a token first
        response = self.client.post(
            reverse("token", kwargs={"version": API_VERSION}),
            json.dumps({"is_activated": True}),
            content_type="application/json",
        )
        token_id = response.json()["id"]

        response = self.client.get(
            reverse("computed_files", kwargs={"version": API_VERSION}),
            {"is_compendia": True},
            HTTP_API_KEY=token_id,
        )
        response_json = response.json()["results"]
        self.assertEqual(3, len(response_json))
        self.assertIsNone(response_json[0]["download_url"])
Пример #28
0
    def test_sra_redownloading(self):
        """Survey, download, then process an experiment we know is SRA."""
        # Clear out pre-existing work dirs so there's no conflicts:
        self.env = EnvironmentVarGuard()
        self.env.set("RUNING_IN_CLOUD", "False")
        with self.env:
            for work_dir in glob.glob(LOCAL_ROOT_DIR + "/processor_job_*"):
                shutil.rmtree(work_dir)

            # Prevent a call being made to NCBI's API to determine
            # organism name/id.
            organism = Organism(name="HOMO_SAPIENS",
                                taxonomy_id=9606,
                                is_scientific_name=True)
            organism.save()

            survey_job = surveyor.survey_experiment("SRP040623", "SRA")

            self.assertTrue(survey_job.success)

            # This experiment has 4 samples that each need a downloader job.
            downloader_jobs = DownloaderJob.objects.all()
            self.assertEqual(downloader_jobs.count(), 4)

            # We want one ProcessorJob to fail because it doesn't have
            # the file it was expecting, so we need to wait until one
            # DownloaderJob finishes, delete a file that is
            # downloaded, and then not delete any more.
            logger.info(
                "Survey Job finished, waiting for Downloader Jobs to complete."
            )
            start_time = timezone.now()
            file_deleted = False
            for downloader_job in downloader_jobs:
                # We want to try and delete the file as quickly as
                # possible, so pass a short loop time and let the waiting
                # loop spin really fast so we lose as little time as
                # possible.
                downloader_job = wait_for_job(downloader_job, DownloaderJob,
                                              start_time, 0.1)
                self.assertTrue(downloader_job.success)
                if not file_deleted:
                    for original_file in OriginalFile.objects.filter(
                            is_downloaded=True):
                        if not original_file.is_archive:
                            original_file.delete_local_file()
                            file_deleted = True

                            # And then to make sure that we can handle
                            # cases where the downloader job is missing:
                            downloader_job.delete()
                            break

            # There's a chance that the processor job with a missing
            # file is aborted before the last downloader job
            # completes, therefore just check that there are at least
            # 3 processor jobs.
            processor_jobs = ProcessorJob.objects.all()
            self.assertGreater(processor_jobs.count(), 2)

            doomed_processor_job = original_file.processor_jobs.all()[0]
            logger.info(
                "Waiting on processor Nomad job %s to fail because it realized it is missing a file.",
                doomed_processor_job.nomad_job_id,
            )

            start_time = timezone.now()
            wait_for_job(doomed_processor_job, ProcessorJob, start_time)

            # The processor job that had a missing file will have
            # recreated its DownloaderJob, which would bring the total
            # to 5, but we also deleted one on purpose so there are 4.
            downloader_jobs = DownloaderJob.objects.all().order_by("-id")
            self.assertEqual(downloader_jobs.count(), 4)

            # However DownloaderJobs don't get queued immediately, so
            # we have to run a foreman function to make it happen:
            retry_lost_downloader_jobs()

            # And we can check that the most recently created
            # DownloaderJob was successful as well:
            recreated_job = downloader_jobs[0]
            recreated_job.refresh_from_db()
            logger.info("Waiting on downloader Nomad job %s",
                        recreated_job.nomad_job_id)
            recreated_job = wait_for_job(recreated_job, DownloaderJob,
                                         start_time)
            self.assertTrue(recreated_job.success)

            # Once the Downloader job succeeds, it should create one
            # and only one processor job, bringing the total back up to 4:
            self.assertEqual(ProcessorJob.objects.all().count(), 4)

            # And finally we can make sure that all of the processor
            # jobs got started correctly, including the one that got
            # recreated. However in order to save time when running
            # tests, we don't actually want to run the full salmon
            # processor. Therefore we don't have the transcriptome
            # index that is needed for this organism so the jobs will
            # fail, but that failure happens past the point that we're
            # testing.
            # So we're going to check for the correct failure_reason.
            logger.info(
                "Downloader Jobs finished, waiting for processor Jobs to complete."
            )
            good_failure_reason = "Missing transcriptome index."
            successful_processor_jobs = []
            for processor_job in processor_jobs:
                # One of these calls to wait_for_job will fail because
                # that job is going to abort when it finds that the
                # file it wants to process is missing.
                try:
                    processor_job = wait_for_job(processor_job, ProcessorJob,
                                                 start_time)
                    if not processor_job.success and processor_job.failure_reason.startswith(
                            good_failure_reason):
                        successful_processor_jobs.append(processor_job)
                except Exception:
                    pass

            self.assertEqual(len(successful_processor_jobs), 4)
Пример #29
0
    def test_transcriptome_redownloading(self, mock_surveyor):
        """Survey, download, then process a transcriptome index. """

        mock_surveyor.side_effect = build_surveyor_init_mock(
            "TRANSCRIPTOME_INDEX")

        # Clear out pre-existing work dirs so there's no conflicts:
        self.env = EnvironmentVarGuard()
        self.env.set("RUNING_IN_CLOUD", "False")
        with self.env:
            # I'm not sure why, but sometimes there are already downloader jobs
            # in the database from previous tests even though they should be
            # removed, so pause a bit
            time.sleep(10)
            downloader_jobs = DownloaderJob.objects.all()
            for job in downloader_jobs:
                print(job)
                print(job.accession_code)
            self.assertEqual(downloader_jobs.count(), 0)

            for length in ["LONG", "SHORT"]:
                work_dir_glob = (LOCAL_ROOT_DIR + "/Caenorhabditis_elegans/" +
                                 length + "/processor_job_*")
                for work_dir in glob.glob(work_dir_glob):
                    shutil.rmtree(work_dir)

            # Prevent a call being made to NCBI's API to determine
            # organism name/id.
            organism = Organism(name="CAENORHABDITIS_ELEGANS",
                                taxonomy_id=6239,
                                is_scientific_name=True)
            organism.save()

            # Make sure that we can delete the file before the processors begin
            # by preventing the downloader jobs from dispatching processor jobs
            # automatically. We send the jobs manually later.
            no_dispatch = EnvironmentVarGuard()
            no_dispatch.set("AUTO_DISPATCH_NOMAD_JOBS", "False")
            with no_dispatch:
                survey_job = surveyor.survey_transcriptome_index(
                    "Caenorhabditis elegans", "Ensembl")

            self.assertTrue(survey_job.success)

            downloader_jobs = DownloaderJob.objects.all()
            self.assertEqual(downloader_jobs.count(), 1)

            logger.info(
                "Survey Job finished, waiting for Downloader Job with Nomad ID %s to complete.",
                downloader_jobs[0].nomad_job_id,
            )

            downloader_job = wait_for_job(downloader_jobs[0], DownloaderJob,
                                          timezone.now())
            self.assertTrue(downloader_job.success)

            og_file_to_delete = OriginalFile.objects.all()[0]
            os.remove(og_file_to_delete.absolute_file_path)

            processor_jobs = ProcessorJob.objects.all()
            for processor_job in processor_jobs:
                # FIXME: we run these in serial because of
                # https://github.com/AlexsLemonade/refinebio/issues/2321
                send_job(
                    ProcessorPipeline[processor_job.pipeline_applied],
                    job=processor_job,
                    is_dispatch=True,
                )
                try:
                    wait_for_job(processor_job, ProcessorJob, timezone.now())
                except Exception:
                    pass

            # The processor job that had a missing file will have
            # recreated its DownloaderJob, which means there should now be two.
            downloader_jobs = DownloaderJob.objects.all().order_by("-id")
            self.assertEqual(downloader_jobs.count(), 2)

            # However DownloaderJobs don't get queued immediately, so
            # we have to run a foreman function to make it happen:
            retry_lost_downloader_jobs()

            # And we can check that the most recently created
            # DownloaderJob was successful as well:
            recreated_job = downloader_jobs[0]
            recreated_job.refresh_from_db()
            logger.info("Waiting on downloader Nomad job %s",
                        recreated_job.nomad_job_id)
            recreated_job = wait_for_job(recreated_job, DownloaderJob,
                                         timezone.now())
            self.assertTrue(recreated_job.success)

            # Once the Downloader job succeeds, it should create two more
            # processor jobs, one for the long and one for the short index,
            # bringing the total to 4:
            processor_jobs = ProcessorJob.objects.all()
            self.assertEqual(processor_jobs.count(), 4)

            # Wait for the processor jobs to be dispatched
            time.sleep(15)

            # And finally we can make sure that both of the
            # processor jobs were successful, including the one that
            # got recreated.
            logger.info(
                "Downloader Jobs finished, waiting for processor Jobs to complete."
            )
            successful_processor_jobs = []
            for processor_job in processor_jobs:
                processor_job.refresh_from_db()
                # One of the calls to wait_for_job may fail if that
                # job aborts before we have selected all of the
                # processor jobs.
                processor_job = wait_for_job(processor_job, ProcessorJob,
                                             timezone.now())
                if processor_job.success:
                    successful_processor_jobs.append(processor_job)

            # While one of the original ProcessorJobs will be aborted,
            # it is hard to be sure what will happen to the other
            # because of the race between processor jobs starting and
            # us deleting the files they need.
            # Therefore, we just verify that one processor job
            # completed successfully for each length, since that is
            # the main thing we need.
            has_long = False
            has_short = False
            for processor_job in successful_processor_jobs:
                if processor_job.pipeline_applied == "TRANSCRIPTOME_INDEX_LONG":
                    has_long = True
                elif processor_job.pipeline_applied == "TRANSCRIPTOME_INDEX_SHORT":
                    has_short = True

            self.assertTrue(has_long)
            self.assertTrue(has_short)
Пример #30
0
    def test_geo_celgz_redownloading(self):
        """Survey, download, then process an experiment we know is Affymetrix.

        Each of the experiment's samples is in its own .cel.gz
        file, which is another way we expect GEO data to come.

        This is another test which uses Aspera, so it unfortunately
        cannot be made to run without relying on NCBI's Aspera server.
        """
        self.env = EnvironmentVarGuard()
        self.env.set("RUNING_IN_CLOUD", "False")
        with self.env:
            # Clear out pre-existing work dirs so there's no conflicts:
            for work_dir in glob.glob(LOCAL_ROOT_DIR + "/processor_job_*"):
                shutil.rmtree(work_dir)

            # Prevent a call being made to NCBI's API to determine
            # organism name/id.
            organism = Organism(name="MUS_MUSCULUS",
                                taxonomy_id=10090,
                                is_scientific_name=True)
            organism.save()

            accession_code = "GSE100388"
            survey_job = surveyor.survey_experiment(accession_code, "GEO")

            SAMPLES_IN_EXPERIMENT = 15

            self.assertTrue(survey_job.success)

            # This experiment's samples each have their own file so
            # they each get their own downloader job.
            downloader_jobs = DownloaderJob.objects.all()
            self.assertEqual(downloader_jobs.count(), SAMPLES_IN_EXPERIMENT)

            logger.info(
                "Survey Job finished, waiting for Downloader Jobs to complete."
            )

            # We're going to spin as fast as we can so we can delete
            # the file in between when the downloader jobs finishes and
            # the processor job starts.
            start_time = timezone.now()
            file_deleted = False
            while (not file_deleted
                   and timezone.now() - start_time < MAX_WAIT_TIME):
                non_archive_files = OriginalFile.objects.filter(
                    is_archive=False)
                for original_file in non_archive_files:
                    if original_file.absolute_file_path and os.path.exists(
                            original_file.absolute_file_path):
                        os.remove(original_file.absolute_file_path)
                        file_deleted = True
                        break

            # Wait for each of the DownloaderJobs to finish
            for downloader_job in downloader_jobs:
                downloader_job = wait_for_job(downloader_job, DownloaderJob,
                                              start_time)
                self.assertTrue(downloader_job.success)

            try:
                doomed_processor_job = original_file.processor_jobs.all()[0]
            except Exception:
                # The doomed job may be aborted before we can get
                # it. This is fine, we just can't look at it.
                doomed_processor_job = None

            if doomed_processor_job:
                logger.info(
                    "Waiting on processor Nomad job %s to fail because it realized it is missing a file.",
                    doomed_processor_job.nomad_job_id,
                )

                start_time = timezone.now()
                doomed_processor_job = wait_for_job(doomed_processor_job,
                                                    ProcessorJob, start_time)
                self.assertTrue(doomed_processor_job.abort)

            # The processor job that had a missing file will have
            # recreated its DownloaderJob, which means there should
            # now be SAMPLES_IN_EXPERIMENT + 1 downloader jobs.
            downloader_jobs = DownloaderJob.objects.all().order_by("-id")
            self.assertEqual(downloader_jobs.count(),
                             SAMPLES_IN_EXPERIMENT + 1)

            # However DownloaderJobs don't get queued immediately, so
            # we have to run a foreman function to make it happen:
            retry_lost_downloader_jobs()

            # And we can check that the most recently created
            # DownloaderJob was successful as well:
            recreated_job = downloader_jobs[0]
            recreated_job.refresh_from_db()
            logger.info("Waiting on downloader Nomad job %s",
                        recreated_job.nomad_job_id)
            recreated_job = wait_for_job(recreated_job, DownloaderJob,
                                         start_time)
            self.assertTrue(recreated_job.success)

            # And finally we can make sure that all of the processor
            # jobs were successful, including the one that got
            # recreated. The processor job that recreated the downloader
            # job has abort=True, so it is excluded below.
            logger.info(
                "Downloader Jobs finished, waiting for processor Jobs to complete."
            )
            processor_jobs = ProcessorJob.objects.all().exclude(
                abort=True)  # exclude aborted jobs
            for processor_job in processor_jobs:
                processor_job = wait_for_job(processor_job, ProcessorJob,
                                             start_time)
                self.assertTrue(processor_job.success)

            self.assertEqual(processor_jobs.count(), SAMPLES_IN_EXPERIMENT)