def test_additional_fasta(self, mock_download, shared_prebuild):
    """Check the first 100 bases of sequence ``U37796.1``.

    Builds the Ustilago maydis (revision 33) genome, runs the pipegraph,
    and compares the fetched slice against the upper-cased reference
    string.
    NOTE(review): judging by the test name, ``U37796.1`` presumably comes
    from an additional FASTA shipped with this genome -- confirm.
    """
    genome = EnsemblGenome("Ustilago_maydis", 33, shared_prebuild)
    ppg.run_pipegraph()

    observed = genome.get_genome_sequence("U37796.1", 0, 100)
    reference_lowercase = (
        "taatcgtgaattgagctaggggcgccaagttacgtggcaaaagcgggctgactggcggcgaagatgtgt"
        "tggtctgcacctgagttcacgaacctgagac"
    )
    assert observed == reference_lowercase.upper()
def test_download(self, new_pipegraph, mock_download, shared_prebuild):
    """Download an Ensembl genome, build indices, and query the results.

    The test runs in two phases:

    1. Build the Ashbya gossypii (revision "41") genome, register a
       prebuild job that writes a truncated ``test.fasta``, build Subread
       indices for two Subread versions, and run the pipegraph. Afterwards
       the downloaded files' checksums, the index files, and the file /
       prebuild lookups are verified.
    2. Start a fresh pipegraph with a new ``PrebuildManager`` pointing at
       the same prebuilt path, re-register the jobs, and verify index path
       sharing plus the sequence / metadata accessors.
    """
    species = "Ashbya_gossypii"  # the smallest eukaryotic species at the time of writing this at 2.8 mb
    g = EnsemblGenome(species, "41", prebuild_manager=shared_prebuild)

    def shorten_genome_fasta(output_path):
        # `g` is looked up when the job runs, so this closure follows the
        # rebinding of `g` in the second phase below.
        with open(g.find_file("genome.fasta")) as op:
            head = op.read(1024 * 100)
        (output_path / "test.fasta").write_text(head)

    def register_test_fasta(genome):
        # Register the prebuild job that produces the truncated test.fasta.
        return genome.prebuild_manager.prebuild(
            f"ensembl/{genome.species}_{genome.revision}/test_fasta",
            "1",
            [],
            ["test.fasta"],
            shorten_genome_fasta,
        )

    test_fasta_job = register_test_fasta(g)
    test_fasta_job.depends_on(g.download_genome())
    g._prebuilds.append(test_fasta_job)

    subread = Subread(version="1.6.3")
    index = g.build_index(subread, "test.fasta")
    subread_old = Subread(version="1.4.3-p1")
    index_old = g.build_index(subread_old, "test.fasta")

    new_pipegraph.run()

    # Note that these are not the checksums from the CHECKSUMS files (those
    # are for the gzipped variants; we keep the files un-gzipped and let the
    # filesystem handle the compression, since we can't rely on downstream
    # tools reading gzip).
    expected_checksums = {
        "genome.fasta": "584a734589964a654c7c1dc23b0167ab",
        "cdna.fasta": "3fc1f19ab829573169cb2488abe39211",
        "genes.gtf": "8bdeec9b3db5278668dbff8b34e9d93b",
        "pep.fasta": "9580fd44832d419c38469d657f6e2484",
    }
    for filename, expected_md5 in expected_checksums.items():
        assert checksum_file(g.find_file(filename)) == expected_md5, filename

    with pytest.raises(OSError):
        g.find_file("no such file")

    # Both Subread versions must have produced their index files ...
    index_files = (
        "subread_index.reads",
        "subread_index.files",
        "subread_index.00.b.array",
    )
    for built_index in (index, index_old):
        for index_file in index_files:
            assert built_index.name_file(index_file).exists()
    # ... and they must not share a location.
    assert index.name_file("subread_index.reads") != index_old.name_file(
        "subread_index.reads"
    )

    assert g.find_file("test.fasta.md5sum").exists()
    # The lookup itself raises for unknown names, so nothing after the call
    # would ever execute inside the `raises` block.
    with pytest.raises(OSError):
        g.find_file("test.fasta.md5sum.nosuchfile")
    assert g.find_prebuild("test.fasta") is test_fasta_job
    with pytest.raises(OSError):
        g.find_prebuild("test.fasta.md5sum.nosuchfile")
    assert g.find_file("genome.fasta.fai").exists()
    assert g.find_file("cdna.fasta.fai").exists()

    # Second phase: fresh pipegraph and a new PrebuildManager on the same
    # prebuilt path.
    new_pipegraph.new_pipegraph()
    pb = PrebuildManager(shared_prebuild.prebuilt_path)
    g = EnsemblGenome(species, "41", prebuild_manager=pb)
    test_fasta_job = register_test_fasta(g)
    g._prebuilds.append(test_fasta_job)

    # Subread 1.5.0 is expected to resolve to the same index file as
    # 1.4.3-p1.
    subread_intermediate = Subread(version="1.5.0")
    index_intermediate = g.build_index(subread_intermediate, "test.fasta")
    assert index_intermediate.name_file(
        "subread_index.reads"
    ) == index_old.name_file("subread_index.reads")

    # Without an explicit fasta name the index lives under a "genome" path.
    index_genome = g.build_index(subread_intermediate)
    assert "/genome/" in str(index_genome.filenames[0])

    assert g.get_chromosome_lengths() == {
        "IV": 1_467_287,
        "MT": 23564,
        "V": 1_519_140,
        "III": 907_494,
        "II": 870_771,
        "VII": 1_800_949,
        "I": 693_414,
        "VI": 1_836_693,
    }
    assert g.get_genome_sequence("VI", 20, 30) == "ACCGCTGAGA"
    assert (
        g.get_cdna_sequence("EFAGOT00000000349")
        == "GCTCGCGTGGCGTAATGGCAACGCGTCTGACTTCTAATCAGAAGATTGTGGGTTCGACCC"
        "CCACCGTGAGTG"
    )

    # The stored protein sequence and the translated CDS must agree.
    expected_protein = (
        "MFSTRICSLLARPFMVPIVPRFGSALLQKPLNGVVVPQFTRGFKVRTSVKKFCAHCYIVR"
        "RKGRVYVYCKSNNKHKQRQG"
    )
    assert g.get_protein_sequence("AAS53315") == expected_protein
    assert (
        g.genetic_code.translate_dna(g.get_cds_sequence("AAS53315"))
        == expected_protein
    )
    assert (
        g.genetic_code.translate_dna(
            g.get_cds_sequence("AAS53315", g.df_proteins.loc["AAS53315"])
        )
        == expected_protein
    )
    # Passing the protein row of a different protein must be rejected.
    with pytest.raises(ValueError):
        g.get_cds_sequence("AAS53315", g.df_proteins.loc["AAS53316"])

    assert (
        g.df_genes_meta.loc["AGOS_ADL186C"]["description"]
        == "Restriction of telomere capping protein 1 [Source:UniProtKB/Swiss-Prot;Acc:Q75AV6]"
    )