Exemplo n.º 1
0
    def handle(self, *args, **options):
        """Run the processor pipeline named by ``job_name`` on job ``job_id``.

        Both values are read from ``options``. An error is logged and the
        process exits with status 1 when the job ID is ``None``, when the job
        name is not a key of ``ProcessorPipeline``, or when no branch below
        handles the resolved pipeline. Once the selected processor returns,
        the process exits with status 0.
        """
        job_id = options["job_id"]
        if job_id is None:
            logger.error("You must specify a job ID.", job_id=job_id)
            sys.exit(1)

        try:
            pipeline = ProcessorPipeline[options["job_name"]]
        except KeyError:
            logger.error("You must specify a valid job name.",
                         job_name=options["job_name"],
                         job_id=job_id)
            sys.exit(1)

        # Each processor is imported inside its own branch, so only the module
        # for the requested pipeline is loaded.
        if pipeline is ProcessorPipeline.AFFY_TO_PCL:
            from data_refinery_workers.processors.array_express import affy_to_pcl
            affy_to_pcl(job_id)
        elif pipeline is ProcessorPipeline.TRANSCRIPTOME_INDEX_SHORT:
            from data_refinery_workers.processors.transcriptome_index import build_transcriptome_index
            build_transcriptome_index(job_id, length="short")
        elif pipeline is ProcessorPipeline.TRANSCRIPTOME_INDEX_LONG:
            from data_refinery_workers.processors.transcriptome_index import build_transcriptome_index
            build_transcriptome_index(job_id, length="long")
        elif pipeline is ProcessorPipeline.AGILENT_TWOCOLOR_TO_PCL:
            from data_refinery_workers.processors.agilent_twocolor import agilent_twocolor_to_pcl
            agilent_twocolor_to_pcl(job_id)
        elif pipeline is ProcessorPipeline.ILLUMINA_TO_PCL:
            from data_refinery_workers.processors.illumina import illumina_to_pcl
            illumina_to_pcl(job_id)
        elif pipeline is ProcessorPipeline.SALMON:
            from data_refinery_workers.processors.salmon import salmon
            salmon(job_id)
        elif pipeline is ProcessorPipeline.SMASHER:
            from data_refinery_workers.processors.smasher import smash
            smash(job_id)
        elif pipeline is ProcessorPipeline.NO_OP:
            from data_refinery_workers.processors.no_op import no_op_processor
            no_op_processor(job_id)
        elif pipeline is ProcessorPipeline.JANITOR:
            from data_refinery_workers.processors.janitor import run_janitor
            run_janitor(job_id)
        elif pipeline is ProcessorPipeline.QN_REFERENCE:
            from data_refinery_workers.processors import qn_reference
            qn_reference.create_qn_reference(job_id)
        else:
            # The name resolved to a pipeline, but no branch above runs it.
            logger.error(
                ("A valid job name was specified for job %s with id %d but "
                 "no processor function is known to run it."),
                options["job_name"], job_id)
            sys.exit(1)

        sys.exit(0)
Exemplo n.º 2
0
    def test_illumina_no_pvalue(self):
        """Check that a file without p-value columns fails at p-value detection.

        The experiment has no p-value columns, so the processor job is
        expected to fail with the "Could not detect PValue column!" reason.
        """
        human = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
        human.save()

        job_spec = {
            "source_filename": "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE41nnn/GSE41355/suppl/GSE41355%5Fnon%2Dnormalized%2Etxt%2Egz",
            "filename": "GSE41355_non-normalized.txt",
            "absolute_file_path": "/home/user/data_store/raw/TEST/ILLUMINA/GSE41355_non-normalized.txt",
            "organism": human,
            "samples": [("GSM1015436", "IRF3/7 DKO 2")],
        }
        processor_job = prepare_illumina_job(job_spec)

        # Only the recorded failure matters here, so the returned context is
        # not kept.
        illumina.illumina_to_pcl(processor_job.pk, cleanup=False)

        self.assertFailed(processor_job, "Could not detect PValue column!")
Exemplo n.º 3
0
    def test_illumina_id_ref_column_with_whitespace(self):
        """Regression test for an ID_REF header with a trailing space.

        See https://github.com/alexslemonade/refinebio/issues/1560: the ID_REF
        column was not detected when its column name ended with a space.
        """
        human = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
        human.save()

        only_sample = (
            "GSM2677583",
            "22Rv1-tetO-Gal4, replicate 1",
            {"description": ["SAMPLE 1"]},
        )
        job_spec = {
            "source_filename": "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE100nnn/GSE100301/suppl/GSE100301%5Fnon%2Dnormalized%2Etxt%2Egz",
            "filename": "GSE100301_non-normalized.txt",
            "absolute_file_path": "/home/user/data_store/raw/TEST/ILLUMINA/GSE100301_non-normalized.txt",
            "organism": human,
            "samples": [only_sample],
        }
        processor_job = prepare_illumina_job(job_spec)

        illumina.illumina_to_pcl(processor_job.pk, cleanup=False)
        self.assertSucceeded(processor_job)
Exemplo n.º 4
0
    def test_illumina_rows_starting_with_whitespace(self):
        """Process GSE112517 with two samples and expect the job to succeed.

        NOTE(review): per the test name, this file presumably contains rows
        that begin with whitespace; the data file is not visible from here.
        """
        human = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
        human.save()

        control_sample = (
            "GSM3071991",
            "MCF-7 KLHDC7B siRNA knockdown control",
            {"description": ["SAMPLE 1"]},
        )
        knockdown_sample = (
            "GSM3071992",
            "MCF-7 KLHDC7B siRNA knockdown",
            {"description": ["SAMPLE 2"]},
        )
        job_spec = {
            "source_filename": "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE112nnn/GSE112517/suppl/GSE112517_non-normalized.txt.gz",
            "filename": "GSE112517_non-normalized.txt",
            "absolute_file_path": "/home/user/data_store/raw/TEST/ILLUMINA/GSE112517_non-normalized.txt",
            "organism": human,
            "samples": [control_sample, knockdown_sample],
        }
        processor_job = prepare_illumina_job(job_spec)

        illumina.illumina_to_pcl(processor_job.pk, cleanup=False)
        self.assertSucceeded(processor_job)
Exemplo n.º 5
0
    def test_illumina_to_pcl(self):
        """Run the basic Illumina to PCL conversion and check its output files."""
        human = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
        human.save()

        job_spec = dict(GSE22427)
        job_spec["organism"] = human
        processor_job = prepare_illumina_job(job_spec)

        # Overwrite one sample's title and accession code so that its
        # detection column can only be found through the description that was
        # supplied as an annotation.
        renamed = Sample.objects.get(title="LV-T350A&si-EZH2-3")
        renamed.title = "ignoreme_for_description"
        renamed.accession_code = "ignoreme_for_description"
        renamed.save()

        context = illumina.illumina_to_pcl(processor_job.pk, cleanup=False)
        self.assertSucceeded(processor_job)

        # Every processed sample must have a smashable result file on disk;
        # delete each one once it has been checked.
        for processed in context["samples"]:
            result_file = processed.get_most_recent_smashable_result_file()
            result_path = result_file.absolute_file_path
            self.assertTrue(os.path.exists(result_path))
            os.remove(result_file.absolute_file_path)

        # The job will not clean up after itself because we are not running
        # in the cloud, so remove the work directory by hand.
        shutil.rmtree(context["work_dir"], ignore_errors=True)
Exemplo n.º 6
0
    def test_bad_illumina_detection(self):
        """Expect the job to abort when it is prepared for the wrong species.

        With the wrong species, platform detection falls below its threshold.
        """
        from data_refinery_workers.processors import illumina

        rat_job = prepare_illumina_job("RATTUS_NORVEGICUS")
        context = illumina.illumina_to_pcl(rat_job.pk)
        aborted = context["abort"]
        self.assertTrue(aborted)

        # The job will not clean up after itself because we are not running
        # in the cloud, so remove the work directory by hand.
        work_dir = context["work_dir"]
        shutil.rmtree(work_dir, ignore_errors=True)
Exemplo n.º 7
0
    def test_illumina_to_pcl(self):
        """Run the basic Illumina to PCL conversion and check its output files."""
        from data_refinery_workers.processors import illumina

        processor_job = prepare_illumina_job()
        context = illumina.illumina_to_pcl(processor_job.pk)

        # Every processed sample must have a smashable result file on disk;
        # delete each one once it has been checked.
        for processed in context["samples"]:
            result_file = processed.get_most_recent_smashable_result_file()
            result_path = result_file.absolute_file_path
            self.assertTrue(os.path.exists(result_path))
            os.remove(result_file.absolute_file_path)

        # The job will not clean up after itself because we are not running
        # in the cloud, so remove the work directory by hand.
        shutil.rmtree(context["work_dir"], ignore_errors=True)
Exemplo n.º 8
0
    def test_good_detection(self):
        """Check that platform detection overrides a mislabeled GEO platform.

        GSE54661 appears to be mislabeled (illuminaHumanv4) on GEO; the
        processor is expected to detect illuminaHumanv3 instead.
        """
        from data_refinery_workers.processors import illumina

        pj = ProcessorJob()
        pj.pipeline_applied = "ILLUMINA_TO_PCL"
        pj.save()

        og_file = OriginalFile()
        og_file.source_filename = "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE54nnn/GSE54661/suppl/GSE54661%5Fnon%5Fnormalized%2Etxt%2Egz"
        og_file.filename = "GSE54661_non_normalized.txt"
        og_file.absolute_file_path = (
            "/home/user/data_store/raw/TEST/ILLUMINA/GSE54661_non_normalized.txt"
        )
        og_file.is_downloaded = True
        og_file.save()

        # Attach the original file to the processor job.
        assoc1 = ProcessorJobOriginalFileAssociation()
        assoc1.original_file = og_file
        assoc1.processor_job = pj
        assoc1.save()

        organism = Organism(name="HOMO_SAPIENS",
                            taxonomy_id=9606,
                            is_scientific_name=True)
        organism.save()

        sample = Sample()
        sample.accession_code = "ABCD-1234"
        sample.title = "hypoxia_Signal"
        sample.organism = organism
        sample.save()

        # Attach the sample to the original file.
        sample_assoc = OriginalFileSampleAssociation()
        sample_assoc.original_file = og_file
        sample_assoc.sample = sample
        sample_assoc.save()

        final_context = illumina.illumina_to_pcl(pj.pk)
        self.assertEqual(final_context["platform"], "illuminaHumanv3")

        # The first annotation of the first sample may only contain the
        # detection keys. assertIn reports the offending key on failure,
        # which assertTrue(key in [...]) does not.
        expected_keys = [
            "detected_platform", "detection_percentage", "mapped_percentage"
        ]
        annotation = final_context["samples"][0].sampleannotation_set.all()[0]
        for key in annotation.data.keys():
            self.assertIn(key, expected_keys)

        # Cleanup after the job, since it won't do so itself when we aren't
        # running in the cloud.
        shutil.rmtree(final_context["work_dir"], ignore_errors=True)
Exemplo n.º 9
0
    def test_bad_illumina_detection(self):
        """Expect the job to abort when the data is paired with the wrong species.

        With the wrong species, platform detection falls below its threshold.
        """
        # NOTE(review): 9606 is the NCBI taxonomy ID of Homo sapiens, while the
        # name says Rattus norvegicus (10116). Presumably only the name matters
        # for this test; confirm before changing it.
        rat = Organism(name="RATTUS_NORVEGICUS", taxonomy_id=9606, is_scientific_name=True)
        rat.save()

        job_spec = dict(GSE22427)
        job_spec["organism"] = rat
        processor_job = prepare_illumina_job(job_spec)

        context = illumina.illumina_to_pcl(processor_job.pk, cleanup=False)
        aborted = context["abort"]
        self.assertTrue(aborted)

        # The job will not clean up after itself because we are not running
        # in the cloud, so remove the work directory by hand.
        shutil.rmtree(context["work_dir"], ignore_errors=True)
Exemplo n.º 10
0
    def test_illumina_to_pcl(self):
        """Run the basic Illumina to PCL conversion and check its output files."""
        from data_refinery_workers.processors import illumina

        human = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
        human.save()

        processor_job = prepare_illumina_job(human)
        context = illumina.illumina_to_pcl(processor_job.pk)

        # Every processed sample must have a smashable result file on disk;
        # delete each one once it has been checked.
        for processed in context["samples"]:
            result_file = processed.get_most_recent_smashable_result_file()
            result_path = result_file.absolute_file_path
            self.assertTrue(os.path.exists(result_path))
            os.remove(result_file.absolute_file_path)

        # The job will not clean up after itself because we are not running
        # in the cloud, so remove the work directory by hand.
        shutil.rmtree(context["work_dir"], ignore_errors=True)
Exemplo n.º 11
0
    def test_illumina_quoted_row_names(self):
        """Check that sanitizing the input strips quotes from headers and row names."""
        organism = Organism(name="HOMO_SAPIENS",
                            taxonomy_id=9606,
                            is_scientific_name=True)
        organism.save()

        pj = prepare_illumina_job({
            "source_filename":
            "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE33nnn/GSE33814/suppl/GSE33814%5Fnon%2Dnormalized%2Etxt%2Egz",
            # Some of the columns are trimmed to save space and time
            "filename":
            "GSE33814_trimmed_non-normalized.txt",
            "absolute_file_path":
            "/home/user/data_store/raw/TEST/ILLUMINA/GSE33814_trimmed_non-normalized.txt",
            "organism":
            organism,
            "samples": [
                ("GSM836222", "IMGUS_32"),
                ("GSM836223", "IMGUS_33"),
            ],
        })

        final_context = illumina.illumina_to_pcl(pj.pk, cleanup=False)
        self.assertSucceeded(pj)

        # Make sure that the row names are no longer quoted after sanitizing the file
        def assertNotQuoted(string: str):
            # startswith/endswith instead of indexing, so an empty cell counts
            # as "not quoted" rather than raising IndexError.
            string = string.strip()
            self.assertFalse(string.startswith('"'))
            self.assertFalse(string.endswith('"'))

        # Read the sanitized file explicitly as UTF-8 instead of relying on
        # the locale's default encoding.
        with open(final_context["sanitized_file_path"], "r",
                  encoding="utf-8") as f:
            reader = csv.reader(f, delimiter="\t")

            headers = next(reader)
            for header in headers:
                assertNotQuoted(header)

            # Also make sure the probe IDs aren't quoted
            first_row = next(reader)
            assertNotQuoted(first_row[0])
Exemplo n.º 12
0
    def test_illumina_space_separated(self):
        """Check that a space-separated input is sanitized into a TSV with an ID_REF header."""
        organism = Organism(name="HOMO_SAPIENS",
                            taxonomy_id=9606,
                            is_scientific_name=True)
        organism.save()

        pj = prepare_illumina_job({
            "source_filename":
            "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE48nnn/GSE48023/suppl/GSE48023%5Fnon%2Dnormalized%2Etxt%2Egz",
            # Some of the columns are trimmed to save space and time
            "filename":
            "GSE48023_trimmed_non-normalized.txt",
            "absolute_file_path":
            "/home/user/data_store/raw/TEST/ILLUMINA/GSE48023_trimmed_non-normalized.txt",
            "organism":
            organism,
            "samples": [
                ("GSM1165512", "WholeBloodRNA_IN0242_Day0"),
                ("GSM1165513", "WholeBloodRNA_IN0242_Day1"),
                ("GSM1165514", "WholeBloodRNA_IN0242_Day14"),
                ("GSM1165515", "WholeBloodRNA_IN0242_Day3"),
                ("GSM1165516", "WholeBloodRNA_IN0243_Day0"),
            ],
        })

        final_context = illumina.illumina_to_pcl(pj.pk, cleanup=False)
        self.assertSucceeded(pj)

        # Assert that the sanitized file is tab-separated (by reading it as a
        # TSV and making sure it has 11 headers) and has an extra ID_REF header.
        # The file is read explicitly as UTF-8 instead of relying on the
        # locale's default encoding.
        with open(final_context["sanitized_file_path"], "r",
                  encoding="utf-8") as f:
            reader = csv.reader(f, delimiter="\t")

            headers = next(reader)

            # ID_REF + 5 observations + 5 p-values
            self.assertEqual(len(headers), 11)
            self.assertEqual(headers[0], "ID_REF")
Exemplo n.º 13
0
    def test_good_detection(self):
        """Check that platform detection overrides a mislabeled GEO platform.

        GSE54661 appears to be mislabeled (illuminaHumanv4) on GEO; the
        processor is expected to detect illuminaHumanv3 instead.
        """

        organism = Organism(name="HOMO_SAPIENS",
                            taxonomy_id=9606,
                            is_scientific_name=True)
        organism.save()

        pj = prepare_illumina_job({
            "source_filename":
            "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE54nnn/GSE54661/suppl/GSE54661%5Fnon%5Fnormalized%2Etxt%2Egz",
            "filename":
            "GSE54661_non_normalized.txt",
            "absolute_file_path":
            "/home/user/data_store/raw/TEST/ILLUMINA/GSE54661_non_normalized.txt",
            "organism":
            organism,
            "samples": [("ABCD-1234", "CB CD34+ hypoxia"),
                        ("ABCD-1235", "CB CD34+ normoxia")],
        })

        final_context = illumina.illumina_to_pcl(pj.pk, cleanup=False)
        self.assertSucceeded(pj)
        self.assertEqual(final_context["platform"], "illuminaHumanv3")

        # The first annotation of the first sample may only contain the
        # detection keys. assertIn reports the offending key on failure,
        # which assertTrue(key in [...]) does not.
        expected_keys = [
            "detected_platform", "detection_percentage", "mapped_percentage"
        ]
        annotation = final_context["samples"][0].sampleannotation_set.all()[0]
        for key in annotation.data.keys():
            self.assertIn(key, expected_keys)

        # Every sample must have a smashable result file on disk.
        for sample in final_context["samples"]:
            smashme = sample.get_most_recent_smashable_result_file()
            self.assertTrue(os.path.exists(smashme.absolute_file_path))

        # Cleanup after the job, since it won't do so itself when we aren't
        # running in the cloud.
        shutil.rmtree(final_context["work_dir"], ignore_errors=True)
Exemplo n.º 14
0
    def test_illumina_latin1_input(self):
        """Check that a latin1-encoded Illumina file is re-encoded as UTF-8.

        GSE106321 is encoded in latin1 and has a "µ" in some column titles,
        which used to make preparing the file raise a Unicode error. The
        sanitized file must be readable as UTF-8 with the titles intact.
        """
        human = Organism(name="HOMO_SAPIENS", taxonomy_id=9606, is_scientific_name=True)
        human.save()

        # (accession code, title) pairs; each sample's description annotation
        # is the same text as its title.
        titled_samples = [
            ("GSM2835938", "A375 + 24h vem (3µM) 2"),
            ("GSM2835937", "A375 + 24h vem (3µM) 1"),
            ("GSM2835936", "A375 + 24h vem (3µM)"),
            ("GSM2835935", "A375 + 24h DMSO 2"),
            ("GSM2835934", "A375+ 24h DMSO 1"),
            ("GSM2835933", "A375 + 24h DMSO"),
        ]
        job_spec = {
            "source_filename": "ftp://ftp.ncbi.nlm.nih.gov/geo/series/GSE106nnn/GSE106321/suppl/GSE106321_non-normalized.txt.gz",
            "filename": "GSE106321_non-normalized.txt",
            "absolute_file_path": "/home/user/data_store/raw/TEST/ILLUMINA/GSE106321_non-normalized.txt",
            "organism": human,
            "samples": [
                (accession, title, {"description": [title]})
                for accession, title in titled_samples
            ],
        }
        processor_job = prepare_illumina_job(job_spec)

        context = illumina.illumina_to_pcl(processor_job.pk, cleanup=False)
        # XXX: This processor job currently fails; make sure it fails at the
        # expected stage. The reason for the failure is tracked in
        # https://github.com/AlexsLemonade/refinebio/issues/2870.
        expected_failure = "Encountered error in R code while running illumina.R pipeline during processing"
        self.assertFailed(processor_job, expected_failure)

        # Reading a latin1 file as UTF-8 would raise, so a successful read
        # shows that the sanitized input was re-encoded.
        expected_headers = [
            "ID_REF",
            "A375 + 24h DMSO",
            "Detection Pval",
            "A375+ 24h DMSO 1",
            "Detection Pval",
            "A375 + 24h DMSO 2",
            "Detection Pval",
            "A375 + 24h vem (3µM)",
            "Detection Pval",
            "A375 + 24h vem (3µM) 1",
            "Detection Pval",
            "A375 + 24h vem (3µM) 2",
            "Detection Pval",
        ] + [""] * 6
        with open(context["sanitized_file_path"], "r", encoding="utf-8") as sanitized:
            rows = csv.reader(sanitized, delimiter="\t")

            # The header row shows whether the "µ" survived the re-encoding.
            header_row = next(rows)
            self.assertEqual(header_row, expected_headers)