def generate_motif_features(feature: CDSFeature, motifs: List[HMMResult]) -> List[CDSMotif]:
    """ Builds a CDSMotif feature for each HMMResult found within a CDS

        Arguments:
            feature: the CDSFeature the motifs were found in
            motifs: the HMMResults to convert, their query coordinates are
                    used as protein coordinates within the CDS

        Returns:
            a list of CDSMotif features, in the same order as the given motifs
    """
    cds_name = feature.get_name()
    # the locus tag is preferred for naming, with the CDS name as the fallback
    locus_tag = feature.locus_tag or cds_name

    results = []
    # the counter ends up in the user-facing domain id, so it starts at 1
    for number, hit in enumerate(motifs, start=1):
        dna_location = feature.get_sub_location_from_protein_coordinates(
            hit.query_start, hit.query_end)
        protein_location = FeatureLocation(hit.query_start, hit.query_end)

        cds_motif = CDSMotif(dna_location, cds_name, protein_location,
                             tool="nrps_pks_domains")
        cds_motif.label = hit.hit_id
        cds_motif.domain_id = 'nrpspksmotif_{}_{:04d}'.format(locus_tag, number)
        cds_motif.evalue = hit.evalue
        cds_motif.score = hit.bitscore
        cds_motif.detection = "hmmscan"
        cds_motif.database = "abmotifs"
        cds_motif.locus_tag = locus_tag
        cds_motif.translation = feature.translation[hit.query_start:hit.query_end]

        results.append(cds_motif)
    return results
def generate_domain_features(gene: CDSFeature, domains: List[HMMResult]) -> Dict[HMMResult, AntismashDomain]:
    """ Generates AntismashDomain features for each provided HMMResult

        Arguments:
            gene: the CDSFeature the domains were found in
            domains: a list of HMMResults found in the CDSFeature

        Returns:
            a dictionary mapping the HMMResult used to the matching AntismashDomain
    """
    new_features = {}  # type: Dict[HMMResult, AntismashDomain]
    domain_counts = defaultdict(int)  # type: Dict[str, int]
    for domain in domains:
        loc = gene.get_sub_location_from_protein_coordinates(domain.query_start, domain.query_end)

        # set up new feature
        new_feature = AntismashDomain(loc, tool="nrps_pks_domains")
        new_feature.domain = domain.hit_id
        new_feature.locus_tag = gene.locus_tag or gene.get_name()
        new_feature.detection = "hmmscan"
        new_feature.database = "nrpspksdomains.hmm"
        new_feature.evalue = domain.evalue
        new_feature.score = domain.bitscore

        # the protein coordinates are half-open, exactly as they were used to
        # build the location above, so the slice must not include an extra
        # trailing residue or the translation would be longer than the location
        new_feature.translation = gene.translation[domain.query_start:domain.query_end]

        domain_counts[domain.hit_id] += 1  # 1-indexed, so increment before use
        domain_name = "{}_{}.{}".format(gene.get_name(), domain.hit_id, domain_counts[domain.hit_id])

        new_feature.domain_id = "nrpspksdomains_" + domain_name
        new_feature.label = domain_name

        new_features[domain] = new_feature
    return new_features
def test_simple_location_forward_complete(self):
    """ Protein coordinates spanning all five residues of a single-part
        forward CDS must give back the complete nucleotide sequence
    """
    whole_gene = FeatureLocation(0, 15, 1)
    cds = CDSFeature(whole_gene, locus_tag="simple", translation="A")

    sub_location = cds.get_sub_location_from_protein_coordinates(0, 5)
    nucleotides = sub_location.extract(self.magic)

    assert nucleotides == self.magic
    assert nucleotides.translate() == self.translation
def test_compound_location_reverse_single(self):
    """ On the reverse strand, protein ranges that fit inside one part of a
        compound location must produce a plain FeatureLocation
    """
    self.reverse_strand()
    cds = CDSFeature(self.location, locus_tag="compound", translation="A")

    # (protein start, protein end, expected length, expected DNA start, expected DNA end)
    cases = [
        (0, 2, 6, 21, 27),
        (2, 3, 3, 12, 15),
    ]
    for prot_start, prot_end, length, dna_start, dna_end in cases:
        new = cds.get_sub_location_from_protein_coordinates(prot_start, prot_end)
        assert isinstance(new, FeatureLocation)
        assert len(new) == length
        assert new.start == dna_start
        assert new.end == dna_end
        translated = new.extract(self.magic_split).translate()
        assert translated == self.translation[prot_start:prot_end]
def test_compound_location_reverse_full(self):
    """ On the reverse strand, asking for all five residues of the compound
        CDS must give a three-part location as long as the CDS location
    """
    self.reverse_strand()
    cds = CDSFeature(self.location, locus_tag="compound", translation="A")

    new = cds.get_sub_location_from_protein_coordinates(0, 5)
    assert isinstance(new, CompoundLocation)
    assert len(new.parts) == 3

    # shown for comparison of the original and generated parts
    print([str(part) for part in cds.location.parts])
    print([str(part) for part in new.parts])

    assert len(new) == len(cds.location)
    translated = new.extract(self.magic_split).translate()
    assert translated == self.translation[0:5]
def test_simple_location_forward_partial(self):
    """ Partial protein ranges of a single-part forward CDS must map to the
        matching codons of the nucleotide sequence
    """
    cds = CDSFeature(FeatureLocation(0, 15, 1), locus_tag="simple", translation="A")

    protein_ranges = [(1, 5), (0, 3), (2, 3), (1, 4)]
    for start, end in protein_ranges:
        print("testing", start, end)
        new = cds.get_sub_location_from_protein_coordinates(start, end)
        print(new)

        nucleotides = new.extract(self.magic)
        # three nucleotides per residue
        assert nucleotides == self.magic[start * 3:end * 3]
        assert nucleotides.translate() == self.translation[start:end]
def test_compound_location_reverse_multiple(self):
    """ On the reverse strand, a protein range crossing a part boundary must
        produce a two-part CompoundLocation
    """
    self.reverse_strand()
    cds = CDSFeature(self.location, locus_tag="compound", translation="A")

    new = cds.get_sub_location_from_protein_coordinates(2, 4)
    assert isinstance(new, CompoundLocation)

    # shown for comparison of the original and generated parts
    print([str(part) for part in cds.location.parts])
    print([str(part) for part in new.parts])

    assert len(new.parts) == 2
    assert len(new) == 6

    first, second = new.parts
    assert first.start == 12
    assert first.end == 15
    assert second.start == 3
    assert second.end == 6

    translated = new.extract(self.magic_split).translate()
    assert translated == self.translation[2:4]
def test_frameshifted_location(self):
    """ A compound location whose two parts overlap by one base must still
        convert protein coordinates into a two-part sub-location
    """
    # the second part starts one base before the first part ends
    overlapping_parts = [FeatureLocation(3, 9, 1), FeatureLocation(8, 14, 1)]
    location = CompoundLocation(overlapping_parts)
    assert len(location) == 12

    seq = Seq("ATGATGAGCCCTCGTCTAGACTACAATGA")
    nucleotides = location.extract(seq)
    assert nucleotides == "ATGAGCCCCTCG"
    assert len(nucleotides) == len(location)

    protein = nucleotides.translate()
    assert protein == "MSPS"

    cds = CDSFeature(location, locus_tag="test", translation=protein)
    new = cds.get_sub_location_from_protein_coordinates(1, 3)
    assert isinstance(new, CompoundLocation)
    assert len(new.parts) == 2
    assert new.start == 6
    assert new.end == 11
def test_complicated(self):
    """ A ten-part forward CDS must give a sub-location whose extracted
        sequence translates to the matching slice of the full translation
    """
    # (start, end) of each part of the CDS, all on the forward strand
    part_bounds = [
        (121124, 122061),
        (122339, 122383),
        (122559, 122666),
        (122712, 122874),
        (123060, 123337),
        (123481, 123749),
        (123809, 124032),
        (124091, 124193),
        (124236, 124401),
        (124684, 124724),
    ]
    parts = [FeatureLocation(start, end, 1) for start, end in part_bounds]
    location = CompoundLocation(parts, operator="join")
    cds = CDSFeature(location, locus_tag="complicated", translation="A")

    seq = (
        "ATGAGCCCTCGTCTAGACTACAATGAAGGATACGATTCCGAAGACGAGGAGATCCCCCGTTACGTACACCAT"
        "TCTAGAGGAAAGAGTCATAGATCCGTGAGGACGTCAGGTCGCTCACGCACGTTGGATTACGACGGGGATGAT"
        "GAAGCTAGTGACCACGCTGCCCCCTCCGGGATTGATCGGGACGCTCGAGCCTGTCCAACATCTCGCAGATAT"
        "ACTGATGACTGCCTTGAGACACATAAATTTCGAGGTGCCCGCTCCTCTCGCTCCCGTGGACGAACCGATGAT"
        "AACAAGGTTTTGTACTACACCAAGTATCGCAGCCCGGCTAAGGACTTGCCTATCGAGCGTGATCCCGAGGGT"
        "ATTAATTTATTCAAGGTCCGACAGCACACACGGCCAAGTGACGCTCATGTGCCCAGTGGATACCGTGAGCCC"
        "TACGAAGTCAAGGTCGACGAGTATGAGGATGATCATCCCCGTACATGCACTAGCCGCCGTGACTCTAGACAG"
        "CCGAAAGTCTACAAGGTCCGGGTTGATGAGTACGAGGATAACCTCCCTGCACGCTCTCACACTGACTTTCGC"
        "GAGTCTCCACGGTCTGAAAGATGCTCTAGCCGCTACACCGAGGACTCGAAGCCTGGGGAGCTTCCTCCCCGC"
        "TCAGGGCCCTGTCGGTCCAGCAGGCCTTCTCCGGTCGATGAGGACGTCGAGTATGAGATCCGTGAGCCCCGA"
        "GGGCATCGCTCCAGTCGACACTCTACAGATGTTGACTTTCAGCCAGTAGAACAACATCCTCGCTTTGGACAA"
        "CGTGGACTCAGCAGACCTTCGCGGGTTGATGAGGAAGTCGATTATGAGATCCGTGAGCCCCGTGGCAATCGT"
        "GTCAGTCACGCTGCTCATGGTGACAGCCCCTGTCAGGACCAAAGCTCCAGGCATATCGGCATTCAATTGTGG"
        "AGTACGCGCGGACCCCGGGCGGCTGGCCGTGGCCGGGGTCCTGATGAGTCTGACGATGTTGAGCCCTAGGCA"
        "GGGAATTGCCGTAATGCTCTTCAAACTGTATAGCAAGCTCAGCATCAATTCTTTAACTGGCAGGCGCTCTGC"
        "TCGCGCGTTTCTCTCTTGGGGTGGTTGGTTTGACTGTAGATTTCCTCTTTCAAGGCTTCTAGATACACCTTT"
        "GGAAGATAGCAACGCTATGCAAGATATTTTTGATAATTCAAATCCTTTTTACACATGGAATAGCTGGTGTTC"
        "CTGTTTTATCTAGGCAATTGACCCACGCCATCTCGGTAGGTACGGTAAAAGCAAGCCGTAATCTCGTATGGC"
        "TTCATCCTTAGCATCGTATAGATCTCCACTCGGGACTCGGCCAGGGATCTTCCATCAATCAACGTGAAGAAG"
        "TCCAGCACCCCGCTGAATCATAATATCCTACCGATTCTGCTCTCTTCACCTCTAGATACCCCTCTAGACTCC"
        "TGTCAACATGTTCCGTACAGTCGAAGACCGCCCGACCCCAAAAGAGGTATATAACTGGCGGCTGTACACCGA"
        "GGCCACCATCATTGCCACTGGTACACTCTTGTGAGTAGGTGCTGTTGTAACGAAAAACATCCAACTGATCCG"
        "CCAGGTTCGGCTATGACTCGGCTTTTGTGGGAACTACCATTGCCCGCCAAAGCTTCGTTGATGCCTTCAACA"
        "TCGTCGAGTCGGAGGCGGCGGATATTTCAAGCAATATCACGTCAACCTTTCAGGCCGGCGCATTTTTCGGCG"
        "CCATCTTCTGCTTCTTGCCTGAGTGAAGCCGTTAGAGACGGTCTCACTGGCTAACCGGACCAAGTGACCGAC"
        "AAAATTGGGCGTAAATGGGCCCTTCAGGCAAACACACTGCTGTTTCTTATTGGCGCGATTGTGATGACGGCT"
        "GCAACACATCACCTTTCCTATATATGTAAGTCATATCCCCGTAGTAGTCAAGGTTGTTAACTAGAGCAGATG"
        "CTGGACGAGCTCTCACCGGCATCGCATGCGGCGCTATCACCGCGACCGTCCCCAGCTATATTGCCGAGCTGT"
        "CAATCGTGTCGATCCGGGGCTTCCTCACCGGGTTCTTCGAAGTCGCATACCAGATTGGTAGCTTGGTTGGAT"
        "TCTGGATCAACTATGGCATTAACGAGAACATGGACAACTCCTCGGCCGCAAGCTGGAGAGTGCCTATGGCAG"
        "TCCAGATCATCCCCGCAGGAGTCCTTTTCATTGGTGGCTTTTCCTCCATGAGAGTCCTCTCTGGCTGATGCG"
        "AAAAGACAGTGAGGATGCCGCGACGGCTGCCCTGGAGGCGTTGAGGAAACTGCCACGGTCTCATCAATGTAA"
        "TCTCCCACCAAGACTCAGGACATAGTCCCATGCTGACTATTTTAGATGTCCAGGAAGACATCGAGATGAACC"
        "GCACCAGGCTGCTGGAGGAAGCTCGGATCGCCGAGAAGTACGGACAAGGTTGGTTGGCATATATCCGAGGCG"
        "CACTCTTCGAGCTCTCGCGCCATGGGATGTGGAATCGTGTTCTGCTCGTCCTCTGTGCCTTTGCACTGCAGA"
        "ATATGTCGGGAGCTGCTGCTATCAACTACTATTCCCCCATACTCTTTGCGTCGTTGGGGATCACTGATGTCG"
        "CTCTGTATACAGGTATTTATGGCCTGGTAAAAGGTAAGTTCTTCTCCTTAAGTATCTCTGGCTGACAATAGG"
        "GATTAACTGATGAGTTTACAGCCGTCGCATCAATTATATTCTACGGCATTCTCATTGATATGTGGGGCCGCC"
        "GACGTCCGACCATTGTTTCGTCACTGGCCTGCCCTCTATGTCTCTGGTTTGTGGGTGCATACGTCAAAGTTG"
        "GGCATCCAGCCGATATCATAGACGCCGGCGGGGAATTGTCCCCCTCCACGGAGGCTGGTGGTAGAGCGGCGA"
        "CTGCGATGATTATGATCTACTCCGTCTTGTAAGTGCCCCTCACTTTTGAATGGGCTTCAGCTTGGAACTCGA"
        "GTAACTGGTATCCAGTTGGTCTTTTGGTCTCAACGGTATCCCCTGGATTGTCTCCGCCGAAATCTTCCCCGG"
        "CGCGCTGCGAAATCTCACGGGGACATGGGCTGCGCTGGTGCAATGGTATGCAATTCCCTTCACCTAGTATCC"
        "ATATCTAAATCAGCAGGTTGATCCAATTCGTTATCACCAAAGCTCTCCCGTACATCTTCAATAGCCTTGGGT"
        "ACGGGACGTGGTTCTTCTTCGCCTCCTGGATGCTGCTCGCTATCATTTGGTCATTCTTTTTTCTCCCGGAAA"
        "CCAAGGGGAAGACTCTCGATGAAATGCATACGATCTTGTACGTTTCTCTCCGTCGAAATGTGGTCTTGGCTA"
        "ATGAATCAGCGGCCATTCTCTCGCCGAAGAGCAGGGTAAGGGTGAGGTTCGAGATAACACTACTAAAAGTGA"
        "TCGGGAGGCTGTCTAGTCCAGTAGTTCTAGAGGACTATTGGCTGGATGATTCCTCTGATGATTTTTGATTGG"
        "TGGTGAAAATGTTGGATGTTTAATGCCAATGTACTGGGAGAGAACATGCCGATAGTACATACCGCTGTGTTG"
        "TATATCGAAGACGGTTGATTTATATATCTTAGTCTTTCAAAAGACGGCACTCACACAATCACACTTCGATGA"
    )
    translation = (
        "MSPRLDYNEGYDSEDEEIPRYVHHSRGKSHRSVRTSGRSRTLDYDGDDEASDHAAPSGIDRDAR"
        "ACPTSRRYTDDCLETHKFRGARSSRSRGRTDDNKVLYYTKYRSPAKDLPIERDPEGINLFKVRQ"
        "HTRPSDAHVPSGYREPYEVKVDEYEDDHPRTCTSRRDSRQPKVYKVRVDEYEDNLPARSHTDFR"
        "ESPRSERCSSRYTEDSKPGELPPRSGPCRSSRPSPVDEDVEYEIREPRGHRSSRHSTDVDFQPV"
        "EQHPRFGQRGLSRPSRVDEEVDYEIREPRGNRVSHAAHGDSPCQDQSSRHIGIQLWTGVPVLSR"
        "QLTHAISTPVNMFRTVEDRPTPKEVYNWRLYTEATIIATGTLLFGYDSAFVGTTIARQSFVDAF"
        "NIVESEAADISSNITSTFQAGAFFGAIFCFLPEADAGRALTGIACGAITATVPSYIAELSIVSI"
        "RGFLTGFFEVAYQIGSLVGFWINYGINENMDNSSAASWRVPMAVQIIPAGVLFIGGFSSMREDI"
        "EMNRTRLLEEARIAEKYGQGWLAYIRGALFELSRHGMWNRVLLVLCAFALQNMSGAAAINYYSP"
        "ILFASLGITDVALYTGIYGLVKAVASIIFYGILIDMWGRRRPTIVSSLACPLCLWFVGAYVKVG"
        "HPADIIDAGGELSPSTEAGGRAATAMIMIYSVFWSFGLNGIPWIVSAEIFPGALRNLTGTWAAL"
        "VQWLIQFVITKALPYIFNSLGYGTWFFFASWMLLAIIWSFFFLPETKGKTLDEMHTIFLSKDGT"
        "HTITLR"
    )

    new = cds.get_sub_location_from_protein_coordinates(353, 412)
    # pad the beginning to match the location
    padded = Seq("x" * location.start + seq)
    assert new.extract(padded).translate() == translation[353:412]
class TestCDSProteinLocation(unittest.TestCase):
    """ Tests the conversion of protein coordinates within a CDS into
        nucleotide locations, via get_sub_location_from_protein_coordinates()
    """

    def setUp(self):
        # the same five codons, with and without filler between the three parts
        self.magic_split = Seq("ATGGCAxxxxxxGGTxxxxxxATTTGT")
        self.magic = Seq("ATGGCAGGTATTTGT")
        self.translation = "MAGIC"
        part_bounds = [(0, 6), (12, 15), (21, 27)]
        self.sub_locations = [FeatureLocation(start, end, strand=1)
                              for start, end in part_bounds]
        self.location = CompoundLocation(self.sub_locations)
        self.cds = CDSFeature(self.location, locus_tag="compound", translation="A")

    def reverse_strand(self):
        """ Flips the sequences and the strand of every sub-location,
            then rebuilds the compound location and the CDS from them
        """
        self.magic = self.magic.reverse_complement()
        self.magic_split = self.magic_split.reverse_complement()
        self.sub_locations = [
            FeatureLocation(part.start, part.end, strand=part.strand * -1)
            for part in self.sub_locations
        ]
        # slicing with the strand as the step: -1 reverses the order of the
        # parts, while 1 leaves the order untouched
        step = self.sub_locations[0].strand
        self.location = CompoundLocation(self.sub_locations[::step])
        self.cds = CDSFeature(self.location, locus_tag="compound", translation="A")

    def test_simple_location_forward_complete(self):
        whole_gene = FeatureLocation(0, 15, 1)
        cds = CDSFeature(whole_gene, locus_tag="simple", translation="A")

        sub_location = cds.get_sub_location_from_protein_coordinates(0, 5)
        nucleotides = sub_location.extract(self.magic)

        assert nucleotides == self.magic
        assert nucleotides.translate() == self.translation

    def test_simple_location_forward_partial(self):
        cds = CDSFeature(FeatureLocation(0, 15, 1), locus_tag="simple", translation="A")

        protein_ranges = [(1, 5), (0, 3), (2, 3), (1, 4)]
        for start, end in protein_ranges:
            print("testing", start, end)
            new = cds.get_sub_location_from_protein_coordinates(start, end)
            print(new)

            nucleotides = new.extract(self.magic)
            # three nucleotides per residue
            assert nucleotides == self.magic[start * 3:end * 3]
            assert nucleotides.translate() == self.translation[start:end]

    def test_compound_location_forward_full(self):
        new = self.cds.get_sub_location_from_protein_coordinates(0, 5)
        assert isinstance(new, CompoundLocation)
        assert len(new.parts) == 3

        # shown for comparison of the original and generated parts
        print([str(part) for part in self.cds.location.parts])
        print([str(part) for part in new.parts])

        assert len(new) == len(self.cds.location)
        assert new == self.location, "%s != %s" % (str(new), str(self.location))

        nucleotides = new.extract(self.magic_split)
        assert nucleotides == self.magic
        assert nucleotides.translate() == self.translation[0:5]

    def test_compound_forward_within_single(self):
        # (protein start, protein end, expected length, expected DNA start, expected DNA end)
        cases = [
            (0, 2, 6, 0, 6),
            (2, 3, 3, 12, 15),
        ]
        for prot_start, prot_end, length, dna_start, dna_end in cases:
            new = self.cds.get_sub_location_from_protein_coordinates(prot_start, prot_end)
            assert isinstance(new, FeatureLocation)
            assert len(new) == length
            assert new.start == dna_start
            assert new.end == dna_end
            translated = new.extract(self.magic_split).translate()
            assert translated == self.translation[prot_start:prot_end]

    def test_compound_forward_over_multiple(self):
        new = self.cds.get_sub_location_from_protein_coordinates(2, 4)
        assert isinstance(new, CompoundLocation)

        # shown for comparison of the original and generated parts
        print([str(part) for part in self.cds.location.parts])
        print([str(part) for part in new.parts])

        assert len(new.parts) == 2
        assert len(new) == 6

        first, second = new.parts
        assert first.start == 12
        assert first.end == 15
        assert second.start == 21
        assert second.end == 24

        translated = new.extract(self.magic_split).translate()
        assert translated == self.translation[2:4]

    def test_compound_location_reverse_full(self):
        self.reverse_strand()
        cds = CDSFeature(self.location, locus_tag="compound", translation="A")

        new = cds.get_sub_location_from_protein_coordinates(0, 5)
        assert isinstance(new, CompoundLocation)
        assert len(new.parts) == 3

        # shown for comparison of the original and generated parts
        print([str(part) for part in cds.location.parts])
        print([str(part) for part in new.parts])

        assert len(new) == len(cds.location)
        translated = new.extract(self.magic_split).translate()
        assert translated == self.translation[0:5]

    def test_compound_location_reverse_single(self):
        self.reverse_strand()
        cds = CDSFeature(self.location, locus_tag="compound", translation="A")

        # (protein start, protein end, expected length, expected DNA start, expected DNA end)
        cases = [
            (0, 2, 6, 21, 27),
            (2, 3, 3, 12, 15),
        ]
        for prot_start, prot_end, length, dna_start, dna_end in cases:
            new = cds.get_sub_location_from_protein_coordinates(prot_start, prot_end)
            assert isinstance(new, FeatureLocation)
            assert len(new) == length
            assert new.start == dna_start
            assert new.end == dna_end
            translated = new.extract(self.magic_split).translate()
            assert translated == self.translation[prot_start:prot_end]

    def test_compound_location_reverse_multiple(self):
        self.reverse_strand()
        cds = CDSFeature(self.location, locus_tag="compound", translation="A")

        new = cds.get_sub_location_from_protein_coordinates(2, 4)
        assert isinstance(new, CompoundLocation)

        # shown for comparison of the original and generated parts
        print([str(part) for part in cds.location.parts])
        print([str(part) for part in new.parts])

        assert len(new.parts) == 2
        assert len(new) == 6

        first, second = new.parts
        assert first.start == 12
        assert first.end == 15
        assert second.start == 3
        assert second.end == 6

        translated = new.extract(self.magic_split).translate()
        assert translated == self.translation[2:4]

    def test_frameshifted_location(self):
        # the second part starts one base before the first part ends
        overlapping_parts = [FeatureLocation(3, 9, 1), FeatureLocation(8, 14, 1)]
        location = CompoundLocation(overlapping_parts)
        assert len(location) == 12

        seq = Seq("ATGATGAGCCCTCGTCTAGACTACAATGA")
        nucleotides = location.extract(seq)
        assert nucleotides == "ATGAGCCCCTCG"
        assert len(nucleotides) == len(location)

        protein = nucleotides.translate()
        assert protein == "MSPS"

        cds = CDSFeature(location, locus_tag="test", translation=protein)
        new = cds.get_sub_location_from_protein_coordinates(1, 3)
        assert isinstance(new, CompoundLocation)
        assert len(new.parts) == 2
        assert new.start == 6
        assert new.end == 11

    def test_complicated(self):
        # (start, end) of each part of the CDS, all on the forward strand
        part_bounds = [
            (121124, 122061),
            (122339, 122383),
            (122559, 122666),
            (122712, 122874),
            (123060, 123337),
            (123481, 123749),
            (123809, 124032),
            (124091, 124193),
            (124236, 124401),
            (124684, 124724),
        ]
        parts = [FeatureLocation(start, end, 1) for start, end in part_bounds]
        location = CompoundLocation(parts, operator="join")
        cds = CDSFeature(location, locus_tag="complicated", translation="A")

        seq = (
            "ATGAGCCCTCGTCTAGACTACAATGAAGGATACGATTCCGAAGACGAGGAGATCCCCCGTTACGTACACCAT"
            "TCTAGAGGAAAGAGTCATAGATCCGTGAGGACGTCAGGTCGCTCACGCACGTTGGATTACGACGGGGATGAT"
            "GAAGCTAGTGACCACGCTGCCCCCTCCGGGATTGATCGGGACGCTCGAGCCTGTCCAACATCTCGCAGATAT"
            "ACTGATGACTGCCTTGAGACACATAAATTTCGAGGTGCCCGCTCCTCTCGCTCCCGTGGACGAACCGATGAT"
            "AACAAGGTTTTGTACTACACCAAGTATCGCAGCCCGGCTAAGGACTTGCCTATCGAGCGTGATCCCGAGGGT"
            "ATTAATTTATTCAAGGTCCGACAGCACACACGGCCAAGTGACGCTCATGTGCCCAGTGGATACCGTGAGCCC"
            "TACGAAGTCAAGGTCGACGAGTATGAGGATGATCATCCCCGTACATGCACTAGCCGCCGTGACTCTAGACAG"
            "CCGAAAGTCTACAAGGTCCGGGTTGATGAGTACGAGGATAACCTCCCTGCACGCTCTCACACTGACTTTCGC"
            "GAGTCTCCACGGTCTGAAAGATGCTCTAGCCGCTACACCGAGGACTCGAAGCCTGGGGAGCTTCCTCCCCGC"
            "TCAGGGCCCTGTCGGTCCAGCAGGCCTTCTCCGGTCGATGAGGACGTCGAGTATGAGATCCGTGAGCCCCGA"
            "GGGCATCGCTCCAGTCGACACTCTACAGATGTTGACTTTCAGCCAGTAGAACAACATCCTCGCTTTGGACAA"
            "CGTGGACTCAGCAGACCTTCGCGGGTTGATGAGGAAGTCGATTATGAGATCCGTGAGCCCCGTGGCAATCGT"
            "GTCAGTCACGCTGCTCATGGTGACAGCCCCTGTCAGGACCAAAGCTCCAGGCATATCGGCATTCAATTGTGG"
            "AGTACGCGCGGACCCCGGGCGGCTGGCCGTGGCCGGGGTCCTGATGAGTCTGACGATGTTGAGCCCTAGGCA"
            "GGGAATTGCCGTAATGCTCTTCAAACTGTATAGCAAGCTCAGCATCAATTCTTTAACTGGCAGGCGCTCTGC"
            "TCGCGCGTTTCTCTCTTGGGGTGGTTGGTTTGACTGTAGATTTCCTCTTTCAAGGCTTCTAGATACACCTTT"
            "GGAAGATAGCAACGCTATGCAAGATATTTTTGATAATTCAAATCCTTTTTACACATGGAATAGCTGGTGTTC"
            "CTGTTTTATCTAGGCAATTGACCCACGCCATCTCGGTAGGTACGGTAAAAGCAAGCCGTAATCTCGTATGGC"
            "TTCATCCTTAGCATCGTATAGATCTCCACTCGGGACTCGGCCAGGGATCTTCCATCAATCAACGTGAAGAAG"
            "TCCAGCACCCCGCTGAATCATAATATCCTACCGATTCTGCTCTCTTCACCTCTAGATACCCCTCTAGACTCC"
            "TGTCAACATGTTCCGTACAGTCGAAGACCGCCCGACCCCAAAAGAGGTATATAACTGGCGGCTGTACACCGA"
            "GGCCACCATCATTGCCACTGGTACACTCTTGTGAGTAGGTGCTGTTGTAACGAAAAACATCCAACTGATCCG"
            "CCAGGTTCGGCTATGACTCGGCTTTTGTGGGAACTACCATTGCCCGCCAAAGCTTCGTTGATGCCTTCAACA"
            "TCGTCGAGTCGGAGGCGGCGGATATTTCAAGCAATATCACGTCAACCTTTCAGGCCGGCGCATTTTTCGGCG"
            "CCATCTTCTGCTTCTTGCCTGAGTGAAGCCGTTAGAGACGGTCTCACTGGCTAACCGGACCAAGTGACCGAC"
            "AAAATTGGGCGTAAATGGGCCCTTCAGGCAAACACACTGCTGTTTCTTATTGGCGCGATTGTGATGACGGCT"
            "GCAACACATCACCTTTCCTATATATGTAAGTCATATCCCCGTAGTAGTCAAGGTTGTTAACTAGAGCAGATG"
            "CTGGACGAGCTCTCACCGGCATCGCATGCGGCGCTATCACCGCGACCGTCCCCAGCTATATTGCCGAGCTGT"
            "CAATCGTGTCGATCCGGGGCTTCCTCACCGGGTTCTTCGAAGTCGCATACCAGATTGGTAGCTTGGTTGGAT"
            "TCTGGATCAACTATGGCATTAACGAGAACATGGACAACTCCTCGGCCGCAAGCTGGAGAGTGCCTATGGCAG"
            "TCCAGATCATCCCCGCAGGAGTCCTTTTCATTGGTGGCTTTTCCTCCATGAGAGTCCTCTCTGGCTGATGCG"
            "AAAAGACAGTGAGGATGCCGCGACGGCTGCCCTGGAGGCGTTGAGGAAACTGCCACGGTCTCATCAATGTAA"
            "TCTCCCACCAAGACTCAGGACATAGTCCCATGCTGACTATTTTAGATGTCCAGGAAGACATCGAGATGAACC"
            "GCACCAGGCTGCTGGAGGAAGCTCGGATCGCCGAGAAGTACGGACAAGGTTGGTTGGCATATATCCGAGGCG"
            "CACTCTTCGAGCTCTCGCGCCATGGGATGTGGAATCGTGTTCTGCTCGTCCTCTGTGCCTTTGCACTGCAGA"
            "ATATGTCGGGAGCTGCTGCTATCAACTACTATTCCCCCATACTCTTTGCGTCGTTGGGGATCACTGATGTCG"
            "CTCTGTATACAGGTATTTATGGCCTGGTAAAAGGTAAGTTCTTCTCCTTAAGTATCTCTGGCTGACAATAGG"
            "GATTAACTGATGAGTTTACAGCCGTCGCATCAATTATATTCTACGGCATTCTCATTGATATGTGGGGCCGCC"
            "GACGTCCGACCATTGTTTCGTCACTGGCCTGCCCTCTATGTCTCTGGTTTGTGGGTGCATACGTCAAAGTTG"
            "GGCATCCAGCCGATATCATAGACGCCGGCGGGGAATTGTCCCCCTCCACGGAGGCTGGTGGTAGAGCGGCGA"
            "CTGCGATGATTATGATCTACTCCGTCTTGTAAGTGCCCCTCACTTTTGAATGGGCTTCAGCTTGGAACTCGA"
            "GTAACTGGTATCCAGTTGGTCTTTTGGTCTCAACGGTATCCCCTGGATTGTCTCCGCCGAAATCTTCCCCGG"
            "CGCGCTGCGAAATCTCACGGGGACATGGGCTGCGCTGGTGCAATGGTATGCAATTCCCTTCACCTAGTATCC"
            "ATATCTAAATCAGCAGGTTGATCCAATTCGTTATCACCAAAGCTCTCCCGTACATCTTCAATAGCCTTGGGT"
            "ACGGGACGTGGTTCTTCTTCGCCTCCTGGATGCTGCTCGCTATCATTTGGTCATTCTTTTTTCTCCCGGAAA"
            "CCAAGGGGAAGACTCTCGATGAAATGCATACGATCTTGTACGTTTCTCTCCGTCGAAATGTGGTCTTGGCTA"
            "ATGAATCAGCGGCCATTCTCTCGCCGAAGAGCAGGGTAAGGGTGAGGTTCGAGATAACACTACTAAAAGTGA"
            "TCGGGAGGCTGTCTAGTCCAGTAGTTCTAGAGGACTATTGGCTGGATGATTCCTCTGATGATTTTTGATTGG"
            "TGGTGAAAATGTTGGATGTTTAATGCCAATGTACTGGGAGAGAACATGCCGATAGTACATACCGCTGTGTTG"
            "TATATCGAAGACGGTTGATTTATATATCTTAGTCTTTCAAAAGACGGCACTCACACAATCACACTTCGATGA"
        )
        translation = (
            "MSPRLDYNEGYDSEDEEIPRYVHHSRGKSHRSVRTSGRSRTLDYDGDDEASDHAAPSGIDRDAR"
            "ACPTSRRYTDDCLETHKFRGARSSRSRGRTDDNKVLYYTKYRSPAKDLPIERDPEGINLFKVRQ"
            "HTRPSDAHVPSGYREPYEVKVDEYEDDHPRTCTSRRDSRQPKVYKVRVDEYEDNLPARSHTDFR"
            "ESPRSERCSSRYTEDSKPGELPPRSGPCRSSRPSPVDEDVEYEIREPRGHRSSRHSTDVDFQPV"
            "EQHPRFGQRGLSRPSRVDEEVDYEIREPRGNRVSHAAHGDSPCQDQSSRHIGIQLWTGVPVLSR"
            "QLTHAISTPVNMFRTVEDRPTPKEVYNWRLYTEATIIATGTLLFGYDSAFVGTTIARQSFVDAF"
            "NIVESEAADISSNITSTFQAGAFFGAIFCFLPEADAGRALTGIACGAITATVPSYIAELSIVSI"
            "RGFLTGFFEVAYQIGSLVGFWINYGINENMDNSSAASWRVPMAVQIIPAGVLFIGGFSSMREDI"
            "EMNRTRLLEEARIAEKYGQGWLAYIRGALFELSRHGMWNRVLLVLCAFALQNMSGAAAINYYSP"
            "ILFASLGITDVALYTGIYGLVKAVASIIFYGILIDMWGRRRPTIVSSLACPLCLWFVGAYVKVG"
            "HPADIIDAGGELSPSTEAGGRAATAMIMIYSVFWSFGLNGIPWIVSAEIFPGALRNLTGTWAAL"
            "VQWLIQFVITKALPYIFNSLGYGTWFFFASWMLLAIIWSFFFLPETKGKTLDEMHTIFLSKDGT"
            "HTITLR"
        )

        new = cds.get_sub_location_from_protein_coordinates(353, 412)
        # pad the beginning to match the location
        padded = Seq("x" * location.start + seq)
        assert new.extract(padded).translate() == translation[353:412]

    def test_extends_past_after(self):
        # the final part gets an open-ended (AfterPosition) end
        open_ended = FeatureLocation(21, AfterPosition(29), strand=1)
        self.sub_locations[-1] = open_ended
        self.cds.location = CompoundLocation(self.sub_locations)

        new = self.cds.get_sub_location_from_protein_coordinates(0, 7)
        assert new.end == 27

    def test_extends_past_before(self):
        self.reverse_strand()
        # the first part gets an open-ended (BeforePosition) start
        old_first = self.sub_locations[0]
        self.sub_locations[0] = FeatureLocation(BeforePosition(2), old_first.end, strand=-1)
        self.cds.location = CompoundLocation(self.sub_locations[::-1])

        new = self.cds.get_sub_location_from_protein_coordinates(0, 7)
        assert new.start == 3