예제 #1
0
 def test_additional_fasta(self, mock_download, shared_prebuild):
     """Check a sequence lookup on the Ustilago maydis (revision 33) genome.

     After running the pipegraph, the 0..100 range of ``U37796.1`` must
     equal the upper-cased reference string below.
     """
     genome = EnsemblGenome("Ustilago_maydis", 33, shared_prebuild)
     ppg.run_pipegraph()
     observed = genome.get_genome_sequence("U37796.1", 0, 100)
     # The reference is written in lower case; the comparison is made
     # against its upper-cased form.
     expected = (
         "taatcgtgaattgagctaggggcgccaagttacgtggcaaaagcgggctgactggcggcgaagatgtgt"
         "tggtctgcacctgagttcacgaacctgagac"
     ).upper()
     assert observed == expected
예제 #2
0
    def test_download(self, new_pipegraph, mock_download, shared_prebuild):
        """End-to-end check of an Ensembl genome download and its accessors.

        Phase 1 builds the ``Ashbya_gossypii`` revision 41 genome, registers
        an extra prebuild holding a truncated copy of the genome fasta,
        builds Subread indices for two Subread versions on it, runs the
        pipegraph and verifies file checksums, file/prebuild lookup and the
        presence of index and ``.fai`` files.

        Phase 2 starts a fresh pipegraph with a new ``PrebuildManager`` on
        the same prebuilt path and verifies index reuse, chromosome lengths,
        sequence accessors and gene metadata.
        """
        species = "Ashbya_gossypii"  # the smallest eukaryotic species at the time of writing this at 2.8 mb
        g = EnsemblGenome(species, "41", prebuild_manager=shared_prebuild)

        def shorten_genome_fasta(output_path):
            # Keep only the head of the genome (first 1024 * 100 characters)
            # so that index building stays fast.
            # NOTE: ``g`` is looked up when this runs, not when it is defined.
            with open(g.find_file("genome.fasta")) as op:
                head = op.read(1024 * 100)
            (output_path / "test.fasta").write_text(head)

        test_fasta_job = g.prebuild_manager.prebuild(
            f"ensembl/{g.species}_{g.revision}/test_fasta",
            "1",
            [],
            ["test.fasta"],
            shorten_genome_fasta,
        )
        test_fasta_job.depends_on(g.download_genome())
        # Register the job so the genome's file lookup can see test.fasta.
        g._prebuilds.append(test_fasta_job)

        subread = Subread(version="1.6.3")
        index = g.build_index(subread, "test.fasta")
        subread_old = Subread(version="1.4.3-p1")
        index_old = g.build_index(subread_old, "test.fasta")

        new_pipegraph.run()
        # Note that these are not the checksums from the CHECKSUMS files (those
        # are for the gzipped variants; we keep them ungzipped and let the
        # filesystem handle the gzip, since we can't rely on the downstream
        # reading gzip...)
        assert (
            checksum_file(g.find_file("genome.fasta"))
            == "584a734589964a654c7c1dc23b0167ab"
        )
        assert (
            checksum_file(g.find_file("cdna.fasta"))
            == "3fc1f19ab829573169cb2488abe39211"
        )
        assert (
            checksum_file(g.find_file("genes.gtf"))
            == "8bdeec9b3db5278668dbff8b34e9d93b"
        )
        assert (
            checksum_file(g.find_file("pep.fasta"))
            == "9580fd44832d419c38469d657f6e2484"
        )
        with pytest.raises(OSError):
            g.find_file("no such file")

        # Both Subread versions must have produced their own index files.
        assert index.name_file("subread_index.reads").exists()
        assert index.name_file("subread_index.files").exists()
        assert index.name_file("subread_index.00.b.array").exists()
        assert index_old.name_file("subread_index.reads").exists()
        assert index_old.name_file("subread_index.files").exists()
        assert index_old.name_file("subread_index.00.b.array").exists()
        assert index.name_file("subread_index.reads") != index_old.name_file(
            "subread_index.reads"
        )

        assert g.find_file("test.fasta.md5sum").exists()
        with pytest.raises(OSError):
            # The lookup itself raises for unknown files (as asserted above),
            # so nothing needs to be asserted on its result.
            g.find_file("test.fasta.md5sum.nosuchfile")
        assert g.find_prebuild("test.fasta") is test_fasta_job
        with pytest.raises(OSError):
            # NOTE(review): presumably find_prebuild raises for an unknown
            # file just like find_file does - confirm; the call chain is kept
            # as originally written.
            g.find_prebuild("test.fasta.md5sum.nosuchfile").exists()
        assert g.find_file("genome.fasta.fai").exists()
        assert g.find_file("cdna.fasta.fai").exists()

        # Phase 2: fresh pipegraph, new prebuild manager on the same path.
        new_pipegraph.new_pipegraph()
        pb = PrebuildManager(shared_prebuild.prebuilt_path)
        g = EnsemblGenome(species, "41", prebuild_manager=pb)
        test_fasta_job = g.prebuild_manager.prebuild(
            f"ensembl/{g.species}_{g.revision}/test_fasta",
            "1",
            [],
            ["test.fasta"],
            shorten_genome_fasta,
        )
        g._prebuilds.append(test_fasta_job)

        # An index requested for Subread 1.5.0 must resolve to the same file
        # as the one built for 1.4.3-p1 in phase 1.
        subread_intermediate = Subread(version="1.5.0")
        index_intermediate = g.build_index(subread_intermediate, "test.fasta")
        assert index_intermediate.name_file(
            "subread_index.reads"
        ) == index_old.name_file("subread_index.reads")
        # Without an explicit fasta name the index is built on the genome.
        index_genome = g.build_index(subread_intermediate)
        assert "/genome/" in str(index_genome.filenames[0])

        assert g.get_chromosome_lengths() == {
            "IV": 1_467_287,
            "MT": 23564,
            "V": 1_519_140,
            "III": 907_494,
            "II": 870_771,
            "VII": 1_800_949,
            "I": 693_414,
            "VI": 1_836_693,
        }

        assert g.get_genome_sequence("VI", 20, 30) == "ACCGCTGAGA"
        assert g.get_cdna_sequence("EFAGOT00000000349") == (
            "GCTCGCGTGGCGTAATGGCAACGCGTCTGACTTCTAATCAGAAGATTGTGGGTTCGACCC"
            "CCACCGTGAGTG"
        )

        protein_sequence = (
            "MFSTRICSLLARPFMVPIVPRFGSALLQKPLNGVVVPQFTRGFKVRTSVKKFCAHCYIVR"
            "RKGRVYVYCKSNNKHKQRQG"
        )
        assert g.get_protein_sequence("AAS53315") == protein_sequence
        # Translating the CDS must reproduce the protein sequence, both with
        # and without an explicitly supplied protein row.
        assert (
            g.genetic_code.translate_dna(g.get_cds_sequence("AAS53315"))
            == protein_sequence
        )
        assert (
            g.genetic_code.translate_dna(
                g.get_cds_sequence("AAS53315", g.df_proteins.loc["AAS53315"])
            )
            == protein_sequence
        )
        # Supplying the row of a different protein must be rejected.
        with pytest.raises(ValueError):
            g.get_cds_sequence("AAS53315", g.df_proteins.loc["AAS53316"])

        assert (
            g.df_genes_meta.loc["AGOS_ADL186C"]["description"]
            == "Restriction of telomere capping protein 1 [Source:UniProtKB/Swiss-Prot;Acc:Q75AV6]"
        )