Exemplo n.º 1
0
def generate_motif_features(feature: CDSFeature,
                            motifs: List[HMMResult]) -> List[CDSMotif]:
    """ Builds one CDSMotif feature for every HMMResult found in a CDS

        Arguments:
            feature: the CDSFeature the motifs were found in
            motifs: a list of HMMResults, one per motif hit

        Returns:
            a list of CDSMotif features, in the same order as the given motifs
    """
    # the locus tag is preferred for naming, the CDS name is only a fallback
    cds_name = feature.get_name()
    locus_tag = feature.locus_tag if feature.locus_tag else cds_name

    created = []
    # the counter ends up in a user facing identifier, so it starts at 1
    for number, hit in enumerate(motifs, 1):
        dna_location = feature.get_sub_location_from_protein_coordinates(
            hit.query_start, hit.query_end)
        protein_location = FeatureLocation(hit.query_start, hit.query_end)
        cds_motif = CDSMotif(dna_location, feature.get_name(),
                             protein_location, tool="nrps_pks_domains")

        # annotations copied from, or derived from, the hit itself
        cds_motif.label = hit.hit_id
        cds_motif.domain_id = 'nrpspksmotif_{}_{:04d}'.format(locus_tag, number)
        cds_motif.evalue = hit.evalue
        cds_motif.score = hit.bitscore

        # fixed provenance details
        cds_motif.detection = "hmmscan"
        cds_motif.database = "abmotifs"
        cds_motif.locus_tag = locus_tag

        # the part of the CDS translation covered by the hit
        cds_motif.translation = feature.translation[hit.query_start:hit.query_end]

        created.append(cds_motif)
    return created
def generate_domain_features(gene: CDSFeature, domains: List[HMMResult]) -> Dict[HMMResult, AntismashDomain]:
    """ Generates AntismashDomain features for each provided HMMResult

        Each domain is named "<gene name>_<hit id>.<n>", where n counts the
        hits sharing that hit id within the gene, starting from 1.

        Arguments:
            gene: the CDSFeature the domains were found in
            domains: a list of HMMResults found in the CDSFeature

        Returns:
            a dictionary mapping the HMMResult used to the matching AntismashDomain
    """
    new_features = {}
    domain_counts = defaultdict(int)  # type: Dict[str, int]
    for domain in domains:
        loc = gene.get_sub_location_from_protein_coordinates(domain.query_start, domain.query_end)

        # set up new feature
        new_feature = AntismashDomain(loc, tool="nrps_pks_domains")
        new_feature.domain = domain.hit_id
        new_feature.locus_tag = gene.locus_tag or gene.get_name()
        new_feature.detection = "hmmscan"
        new_feature.database = "nrpspksdomains.hmm"
        new_feature.evalue = domain.evalue
        new_feature.score = domain.bitscore

        # the hit coordinates are used as a half-open range when building the
        # location above, so the translation must use exactly the same range
        # (an extra "+ 1" here made the translation one residue longer than
        # the location it belongs to)
        new_feature.translation = gene.translation[domain.query_start:domain.query_end]

        domain_counts[domain.hit_id] += 1  # 1-indexed, so increment before use
        domain_name = "{}_{}.{}".format(gene.get_name(), domain.hit_id, domain_counts[domain.hit_id])

        new_feature.domain_id = "nrpspksdomains_" + domain_name
        new_feature.label = domain_name

        new_features[domain] = new_feature
    return new_features
Exemplo n.º 3
0
 def test_simple_location_forward_complete(self):
     """ All 5 residues of a single-part forward CDS must map back to the
         complete nucleotide sequence
     """
     whole_gene = FeatureLocation(0, 15, 1)
     cds = CDSFeature(whole_gene, locus_tag="simple", translation="A")
     sub_location = cds.get_sub_location_from_protein_coordinates(0, 5)
     nucleotides = sub_location.extract(self.magic)
     assert nucleotides == self.magic
     assert nucleotides.translate() == self.translation
Exemplo n.º 4
0
    def test_compound_location_reverse_single(self):
        """ Protein ranges that fit inside one part of a reverse-strand
            compound CDS must give a plain FeatureLocation
        """
        self.reverse_strand()
        cds = CDSFeature(self.location, locus_tag="compound", translation="A")

        # (protein start, protein end, expected DNA start, expected DNA end)
        cases = [
            (0, 2, 21, 27),
            (2, 3, 12, 15),
        ]
        for prot_start, prot_end, dna_start, dna_end in cases:
            sub_location = cds.get_sub_location_from_protein_coordinates(prot_start, prot_end)
            assert isinstance(sub_location, FeatureLocation)
            assert len(sub_location) == dna_end - dna_start
            assert sub_location.start == dna_start
            assert sub_location.end == dna_end
            translated = sub_location.extract(self.magic_split).translate()
            assert translated == self.translation[prot_start:prot_end]
Exemplo n.º 5
0
 def test_compound_location_reverse_full(self):
     """ The full protein range of a reverse-strand compound CDS must cover
         every part of the original location
     """
     self.reverse_strand()
     cds = CDSFeature(self.location, locus_tag="compound", translation="A")
     sub_location = cds.get_sub_location_from_protein_coordinates(0, 5)
     assert isinstance(sub_location, CompoundLocation)
     assert len(sub_location.parts) == 3
     # shown to make a failure easier to diagnose
     print([str(part) for part in cds.location.parts])
     print([str(part) for part in sub_location.parts])
     assert len(sub_location) == len(cds.location)
     translated = sub_location.extract(self.magic_split).translate()
     assert translated == self.translation[0:5]
Exemplo n.º 6
0
 def test_simple_location_forward_partial(self):
     """ Partial protein ranges of a single-part forward CDS must map to the
         matching slice of the nucleotide sequence
     """
     whole_gene = FeatureLocation(0, 15, 1)
     cds = CDSFeature(whole_gene, locus_tag="simple", translation="A")
     protein_ranges = [(1, 5), (0, 3), (2, 3), (1, 4)]
     for start, end in protein_ranges:
         print("testing", start, end)
         sub_location = cds.get_sub_location_from_protein_coordinates(start, end)
         print(sub_location)
         nucleotides = sub_location.extract(self.magic)
         # three nucleotides per residue
         expected_nucleotides = self.magic[start * 3:end * 3]
         assert nucleotides == expected_nucleotides
         assert nucleotides.translate() == self.translation[start:end]
Exemplo n.º 7
0
    def test_compound_location_reverse_multiple(self):
        """ A protein range crossing two parts of a reverse-strand compound
            CDS must give a two-part CompoundLocation
        """
        self.reverse_strand()
        cds = CDSFeature(self.location, locus_tag="compound", translation="A")

        sub_location = cds.get_sub_location_from_protein_coordinates(2, 4)
        assert isinstance(sub_location, CompoundLocation)
        # shown to make a failure easier to diagnose
        print([str(part) for part in cds.location.parts])
        print([str(part) for part in sub_location.parts])
        assert len(sub_location.parts) == 2
        assert len(sub_location) == 6

        first, second = sub_location.parts
        assert first.start == 12
        assert first.end == 15
        assert second.start == 3
        assert second.end == 6

        translated = sub_location.extract(self.magic_split).translate()
        assert translated == self.translation[2:4]
Exemplo n.º 8
0
    def test_frameshifted_location(self):
        """ Sub-locations must still be found when the two parts of a CDS
            overlap by one nucleotide
        """
        # position 8 is covered by both parts
        exons = [FeatureLocation(3, 9, 1), FeatureLocation(8, 14, 1)]
        location = CompoundLocation(exons)
        assert len(location) == 12

        genome = Seq("ATGATGAGCCCTCGTCTAGACTACAATGA")
        nucleotides = location.extract(genome)
        assert nucleotides == "ATGAGCCCCTCG"
        assert len(nucleotides) == len(location)
        protein = nucleotides.translate()
        assert protein == "MSPS"

        cds = CDSFeature(location, locus_tag="test", translation=protein)
        sub_location = cds.get_sub_location_from_protein_coordinates(1, 3)
        assert isinstance(sub_location, CompoundLocation)
        assert len(sub_location.parts) == 2
        assert sub_location.start == 6
        assert sub_location.end == 11
Exemplo n.º 9
0
 def test_complicated(self):
     """ Checks the protein to nucleotide mapping on a forward-strand CDS
         made of ten parts

         Protein coordinates 353 to 412 are requested, a range that starts in
         the third part and ends in the fourth. The nucleotides extracted with
         the resulting location must translate to the same slice of the
         expected translation.
     """
     # the ten parts of the CDS, all on the forward strand
     parts = [
         FeatureLocation(121124, 122061, 1),
         FeatureLocation(122339, 122383, 1),
         FeatureLocation(122559, 122666, 1),
         FeatureLocation(122712, 122874, 1),
         FeatureLocation(123060, 123337, 1),
         FeatureLocation(123481, 123749, 1),
         FeatureLocation(123809, 124032, 1),
         FeatureLocation(124091, 124193, 1),
         FeatureLocation(124236, 124401, 1),
         FeatureLocation(124684, 124724, 1)
     ]
     location = CompoundLocation(parts, operator="join")
     # the translation given here is a placeholder, the expected translation
     # is only used for the final comparison
     cds = CDSFeature(location, locus_tag="complicated", translation="A")
     # nucleotides of the region, beginning at location.start (hence the
     # padding applied before extraction at the end of the test)
     seq = (
         "ATGAGCCCTCGTCTAGACTACAATGAAGGATACGATTCCGAAGACGAGGAGATCCCCCGTTACGTACACCAT"
         "TCTAGAGGAAAGAGTCATAGATCCGTGAGGACGTCAGGTCGCTCACGCACGTTGGATTACGACGGGGATGAT"
         "GAAGCTAGTGACCACGCTGCCCCCTCCGGGATTGATCGGGACGCTCGAGCCTGTCCAACATCTCGCAGATAT"
         "ACTGATGACTGCCTTGAGACACATAAATTTCGAGGTGCCCGCTCCTCTCGCTCCCGTGGACGAACCGATGAT"
         "AACAAGGTTTTGTACTACACCAAGTATCGCAGCCCGGCTAAGGACTTGCCTATCGAGCGTGATCCCGAGGGT"
         "ATTAATTTATTCAAGGTCCGACAGCACACACGGCCAAGTGACGCTCATGTGCCCAGTGGATACCGTGAGCCC"
         "TACGAAGTCAAGGTCGACGAGTATGAGGATGATCATCCCCGTACATGCACTAGCCGCCGTGACTCTAGACAG"
         "CCGAAAGTCTACAAGGTCCGGGTTGATGAGTACGAGGATAACCTCCCTGCACGCTCTCACACTGACTTTCGC"
         "GAGTCTCCACGGTCTGAAAGATGCTCTAGCCGCTACACCGAGGACTCGAAGCCTGGGGAGCTTCCTCCCCGC"
         "TCAGGGCCCTGTCGGTCCAGCAGGCCTTCTCCGGTCGATGAGGACGTCGAGTATGAGATCCGTGAGCCCCGA"
         "GGGCATCGCTCCAGTCGACACTCTACAGATGTTGACTTTCAGCCAGTAGAACAACATCCTCGCTTTGGACAA"
         "CGTGGACTCAGCAGACCTTCGCGGGTTGATGAGGAAGTCGATTATGAGATCCGTGAGCCCCGTGGCAATCGT"
         "GTCAGTCACGCTGCTCATGGTGACAGCCCCTGTCAGGACCAAAGCTCCAGGCATATCGGCATTCAATTGTGG"
         "AGTACGCGCGGACCCCGGGCGGCTGGCCGTGGCCGGGGTCCTGATGAGTCTGACGATGTTGAGCCCTAGGCA"
         "GGGAATTGCCGTAATGCTCTTCAAACTGTATAGCAAGCTCAGCATCAATTCTTTAACTGGCAGGCGCTCTGC"
         "TCGCGCGTTTCTCTCTTGGGGTGGTTGGTTTGACTGTAGATTTCCTCTTTCAAGGCTTCTAGATACACCTTT"
         "GGAAGATAGCAACGCTATGCAAGATATTTTTGATAATTCAAATCCTTTTTACACATGGAATAGCTGGTGTTC"
         "CTGTTTTATCTAGGCAATTGACCCACGCCATCTCGGTAGGTACGGTAAAAGCAAGCCGTAATCTCGTATGGC"
         "TTCATCCTTAGCATCGTATAGATCTCCACTCGGGACTCGGCCAGGGATCTTCCATCAATCAACGTGAAGAAG"
         "TCCAGCACCCCGCTGAATCATAATATCCTACCGATTCTGCTCTCTTCACCTCTAGATACCCCTCTAGACTCC"
         "TGTCAACATGTTCCGTACAGTCGAAGACCGCCCGACCCCAAAAGAGGTATATAACTGGCGGCTGTACACCGA"
         "GGCCACCATCATTGCCACTGGTACACTCTTGTGAGTAGGTGCTGTTGTAACGAAAAACATCCAACTGATCCG"
         "CCAGGTTCGGCTATGACTCGGCTTTTGTGGGAACTACCATTGCCCGCCAAAGCTTCGTTGATGCCTTCAACA"
         "TCGTCGAGTCGGAGGCGGCGGATATTTCAAGCAATATCACGTCAACCTTTCAGGCCGGCGCATTTTTCGGCG"
         "CCATCTTCTGCTTCTTGCCTGAGTGAAGCCGTTAGAGACGGTCTCACTGGCTAACCGGACCAAGTGACCGAC"
         "AAAATTGGGCGTAAATGGGCCCTTCAGGCAAACACACTGCTGTTTCTTATTGGCGCGATTGTGATGACGGCT"
         "GCAACACATCACCTTTCCTATATATGTAAGTCATATCCCCGTAGTAGTCAAGGTTGTTAACTAGAGCAGATG"
         "CTGGACGAGCTCTCACCGGCATCGCATGCGGCGCTATCACCGCGACCGTCCCCAGCTATATTGCCGAGCTGT"
         "CAATCGTGTCGATCCGGGGCTTCCTCACCGGGTTCTTCGAAGTCGCATACCAGATTGGTAGCTTGGTTGGAT"
         "TCTGGATCAACTATGGCATTAACGAGAACATGGACAACTCCTCGGCCGCAAGCTGGAGAGTGCCTATGGCAG"
         "TCCAGATCATCCCCGCAGGAGTCCTTTTCATTGGTGGCTTTTCCTCCATGAGAGTCCTCTCTGGCTGATGCG"
         "AAAAGACAGTGAGGATGCCGCGACGGCTGCCCTGGAGGCGTTGAGGAAACTGCCACGGTCTCATCAATGTAA"
         "TCTCCCACCAAGACTCAGGACATAGTCCCATGCTGACTATTTTAGATGTCCAGGAAGACATCGAGATGAACC"
         "GCACCAGGCTGCTGGAGGAAGCTCGGATCGCCGAGAAGTACGGACAAGGTTGGTTGGCATATATCCGAGGCG"
         "CACTCTTCGAGCTCTCGCGCCATGGGATGTGGAATCGTGTTCTGCTCGTCCTCTGTGCCTTTGCACTGCAGA"
         "ATATGTCGGGAGCTGCTGCTATCAACTACTATTCCCCCATACTCTTTGCGTCGTTGGGGATCACTGATGTCG"
         "CTCTGTATACAGGTATTTATGGCCTGGTAAAAGGTAAGTTCTTCTCCTTAAGTATCTCTGGCTGACAATAGG"
         "GATTAACTGATGAGTTTACAGCCGTCGCATCAATTATATTCTACGGCATTCTCATTGATATGTGGGGCCGCC"
         "GACGTCCGACCATTGTTTCGTCACTGGCCTGCCCTCTATGTCTCTGGTTTGTGGGTGCATACGTCAAAGTTG"
         "GGCATCCAGCCGATATCATAGACGCCGGCGGGGAATTGTCCCCCTCCACGGAGGCTGGTGGTAGAGCGGCGA"
         "CTGCGATGATTATGATCTACTCCGTCTTGTAAGTGCCCCTCACTTTTGAATGGGCTTCAGCTTGGAACTCGA"
         "GTAACTGGTATCCAGTTGGTCTTTTGGTCTCAACGGTATCCCCTGGATTGTCTCCGCCGAAATCTTCCCCGG"
         "CGCGCTGCGAAATCTCACGGGGACATGGGCTGCGCTGGTGCAATGGTATGCAATTCCCTTCACCTAGTATCC"
         "ATATCTAAATCAGCAGGTTGATCCAATTCGTTATCACCAAAGCTCTCCCGTACATCTTCAATAGCCTTGGGT"
         "ACGGGACGTGGTTCTTCTTCGCCTCCTGGATGCTGCTCGCTATCATTTGGTCATTCTTTTTTCTCCCGGAAA"
         "CCAAGGGGAAGACTCTCGATGAAATGCATACGATCTTGTACGTTTCTCTCCGTCGAAATGTGGTCTTGGCTA"
         "ATGAATCAGCGGCCATTCTCTCGCCGAAGAGCAGGGTAAGGGTGAGGTTCGAGATAACACTACTAAAAGTGA"
         "TCGGGAGGCTGTCTAGTCCAGTAGTTCTAGAGGACTATTGGCTGGATGATTCCTCTGATGATTTTTGATTGG"
         "TGGTGAAAATGTTGGATGTTTAATGCCAATGTACTGGGAGAGAACATGCCGATAGTACATACCGCTGTGTTG"
         "TATATCGAAGACGGTTGATTTATATATCTTAGTCTTTCAAAAGACGGCACTCACACAATCACACTTCGATGA"
     )
     # the expected translation of the whole CDS
     translation = (
         "MSPRLDYNEGYDSEDEEIPRYVHHSRGKSHRSVRTSGRSRTLDYDGDDEASDHAAPSGIDRDAR"
         "ACPTSRRYTDDCLETHKFRGARSSRSRGRTDDNKVLYYTKYRSPAKDLPIERDPEGINLFKVRQ"
         "HTRPSDAHVPSGYREPYEVKVDEYEDDHPRTCTSRRDSRQPKVYKVRVDEYEDNLPARSHTDFR"
         "ESPRSERCSSRYTEDSKPGELPPRSGPCRSSRPSPVDEDVEYEIREPRGHRSSRHSTDVDFQPV"
         "EQHPRFGQRGLSRPSRVDEEVDYEIREPRGNRVSHAAHGDSPCQDQSSRHIGIQLWTGVPVLSR"
         "QLTHAISTPVNMFRTVEDRPTPKEVYNWRLYTEATIIATGTLLFGYDSAFVGTTIARQSFVDAF"
         "NIVESEAADISSNITSTFQAGAFFGAIFCFLPEADAGRALTGIACGAITATVPSYIAELSIVSI"
         "RGFLTGFFEVAYQIGSLVGFWINYGINENMDNSSAASWRVPMAVQIIPAGVLFIGGFSSMREDI"
         "EMNRTRLLEEARIAEKYGQGWLAYIRGALFELSRHGMWNRVLLVLCAFALQNMSGAAAINYYSP"
         "ILFASLGITDVALYTGIYGLVKAVASIIFYGILIDMWGRRRPTIVSSLACPLCLWFVGAYVKVG"
         "HPADIIDAGGELSPSTEAGGRAATAMIMIYSVFWSFGLNGIPWIVSAEIFPGALRNLTGTWAAL"
         "VQWLIQFVITKALPYIFNSLGYGTWFFFASWMLLAIIWSFFFLPETKGKTLDEMHTIFLSKDGT"
         "HTITLR")
     new = cds.get_sub_location_from_protein_coordinates(353, 412)
     # pad the beginning so that indices into the sequence line up with the
     # absolute coordinates used by the location
     assert new.extract(Seq("x" * location.start +
                            seq)).translate() == translation[353:412]
Exemplo n.º 10
0
class TestCDSProteinLocation(unittest.TestCase):
    def setUp(self):
        """ Creates a three-part forward-strand CDS encoding "MAGIC", along
            with the nucleotides both with and without the gaps between parts
        """
        # "x" fills the gaps between the parts of the compound location
        self.magic_split = Seq("ATGGCAxxxxxxGGTxxxxxxATTTGT")
        self.magic = Seq("ATGGCAGGTATTTGT")
        self.translation = "MAGIC"

        part_bounds = [(0, 6), (12, 15), (21, 27)]
        self.sub_locations = [FeatureLocation(start, end, strand=1)
                              for start, end in part_bounds]
        self.location = CompoundLocation(self.sub_locations)
        self.cds = CDSFeature(self.location, locus_tag="compound", translation="A")

    def reverse_strand(self):
        """ Switches the fixture to the opposite strand: both sequences are
            reverse complemented, every part has its strand flipped, and the
            compound location and CDS are rebuilt from the flipped parts
        """
        self.magic = self.magic.reverse_complement()
        self.magic_split = self.magic_split.reverse_complement()

        flipped = []
        for part in self.sub_locations:
            flipped.append(FeatureLocation(part.start, part.end, strand=part.strand * -1))
        self.sub_locations = flipped

        # the new strand of the first part doubles as the slice step, so the
        # part order is reversed whenever that strand is -1
        step = self.sub_locations[0].strand
        self.location = CompoundLocation(self.sub_locations[::step])
        self.cds = CDSFeature(self.location, locus_tag="compound", translation="A")

    def test_simple_location_forward_complete(self):
        """ All 5 residues of a single-part forward CDS must map back to the
            complete nucleotide sequence
        """
        whole_gene = FeatureLocation(0, 15, 1)
        cds = CDSFeature(whole_gene, locus_tag="simple", translation="A")
        sub_location = cds.get_sub_location_from_protein_coordinates(0, 5)
        nucleotides = sub_location.extract(self.magic)
        assert nucleotides == self.magic
        assert nucleotides.translate() == self.translation

    def test_simple_location_forward_partial(self):
        """ Partial protein ranges of a single-part forward CDS must map to
            the matching slice of the nucleotide sequence
        """
        whole_gene = FeatureLocation(0, 15, 1)
        cds = CDSFeature(whole_gene, locus_tag="simple", translation="A")
        protein_ranges = [(1, 5), (0, 3), (2, 3), (1, 4)]
        for start, end in protein_ranges:
            print("testing", start, end)
            sub_location = cds.get_sub_location_from_protein_coordinates(start, end)
            print(sub_location)
            nucleotides = sub_location.extract(self.magic)
            # three nucleotides per residue
            expected_nucleotides = self.magic[start * 3:end * 3]
            assert nucleotides == expected_nucleotides
            assert nucleotides.translate() == self.translation[start:end]

    def test_compound_location_forward_full(self):
        """ The full protein range of the forward compound CDS must give back
            a location equal to the CDS location itself
        """
        sub_location = self.cds.get_sub_location_from_protein_coordinates(0, 5)
        assert isinstance(sub_location, CompoundLocation)
        assert len(sub_location.parts) == 3
        # shown to make a failure easier to diagnose
        print([str(part) for part in self.cds.location.parts])
        print([str(part) for part in sub_location.parts])
        assert len(sub_location) == len(self.cds.location)
        mismatch = "%s != %s" % (str(sub_location), str(self.location))
        assert sub_location == self.location, mismatch
        nucleotides = sub_location.extract(self.magic_split)
        assert nucleotides == self.magic
        assert nucleotides.translate() == self.translation[0:5]

    def test_compound_forward_within_single(self):
        """ Protein ranges that fit inside one part of the forward compound
            CDS must give a plain FeatureLocation
        """
        # (protein start, protein end, expected DNA start, expected DNA end)
        cases = [
            (0, 2, 0, 6),
            (2, 3, 12, 15),
        ]
        for prot_start, prot_end, dna_start, dna_end in cases:
            sub_location = self.cds.get_sub_location_from_protein_coordinates(prot_start, prot_end)
            assert isinstance(sub_location, FeatureLocation)
            assert len(sub_location) == dna_end - dna_start
            assert sub_location.start == dna_start
            assert sub_location.end == dna_end
            translated = sub_location.extract(self.magic_split).translate()
            assert translated == self.translation[prot_start:prot_end]

    def test_compound_forward_over_multiple(self):
        """ A protein range crossing two parts of the forward compound CDS
            must give a two-part CompoundLocation
        """
        sub_location = self.cds.get_sub_location_from_protein_coordinates(2, 4)
        assert isinstance(sub_location, CompoundLocation)
        # shown to make a failure easier to diagnose
        print([str(part) for part in self.cds.location.parts])
        print([str(part) for part in sub_location.parts])
        assert len(sub_location.parts) == 2
        assert len(sub_location) == 6

        first, second = sub_location.parts
        assert first.start == 12
        assert first.end == 15
        assert second.start == 21
        assert second.end == 24

        translated = sub_location.extract(self.magic_split).translate()
        assert translated == self.translation[2:4]

    def test_compound_location_reverse_full(self):
        """ The full protein range of a reverse-strand compound CDS must
            cover every part of the original location
        """
        self.reverse_strand()
        cds = CDSFeature(self.location, locus_tag="compound", translation="A")
        sub_location = cds.get_sub_location_from_protein_coordinates(0, 5)
        assert isinstance(sub_location, CompoundLocation)
        assert len(sub_location.parts) == 3
        # shown to make a failure easier to diagnose
        print([str(part) for part in cds.location.parts])
        print([str(part) for part in sub_location.parts])
        assert len(sub_location) == len(cds.location)
        translated = sub_location.extract(self.magic_split).translate()
        assert translated == self.translation[0:5]

    def test_compound_location_reverse_single(self):
        """ Protein ranges that fit inside one part of a reverse-strand
            compound CDS must give a plain FeatureLocation
        """
        self.reverse_strand()
        cds = CDSFeature(self.location, locus_tag="compound", translation="A")

        # (protein start, protein end, expected DNA start, expected DNA end)
        cases = [
            (0, 2, 21, 27),
            (2, 3, 12, 15),
        ]
        for prot_start, prot_end, dna_start, dna_end in cases:
            sub_location = cds.get_sub_location_from_protein_coordinates(prot_start, prot_end)
            assert isinstance(sub_location, FeatureLocation)
            assert len(sub_location) == dna_end - dna_start
            assert sub_location.start == dna_start
            assert sub_location.end == dna_end
            translated = sub_location.extract(self.magic_split).translate()
            assert translated == self.translation[prot_start:prot_end]

    def test_compound_location_reverse_multiple(self):
        """ A protein range crossing two parts of a reverse-strand compound
            CDS must give a two-part CompoundLocation
        """
        self.reverse_strand()
        cds = CDSFeature(self.location, locus_tag="compound", translation="A")

        sub_location = cds.get_sub_location_from_protein_coordinates(2, 4)
        assert isinstance(sub_location, CompoundLocation)
        # shown to make a failure easier to diagnose
        print([str(part) for part in cds.location.parts])
        print([str(part) for part in sub_location.parts])
        assert len(sub_location.parts) == 2
        assert len(sub_location) == 6

        first, second = sub_location.parts
        assert first.start == 12
        assert first.end == 15
        assert second.start == 3
        assert second.end == 6

        translated = sub_location.extract(self.magic_split).translate()
        assert translated == self.translation[2:4]

    def test_frameshifted_location(self):
        """ Sub-locations must still be found when the two parts of a CDS
            overlap by one nucleotide
        """
        # position 8 is covered by both parts
        exons = [FeatureLocation(3, 9, 1), FeatureLocation(8, 14, 1)]
        location = CompoundLocation(exons)
        assert len(location) == 12

        genome = Seq("ATGATGAGCCCTCGTCTAGACTACAATGA")
        nucleotides = location.extract(genome)
        assert nucleotides == "ATGAGCCCCTCG"
        assert len(nucleotides) == len(location)
        protein = nucleotides.translate()
        assert protein == "MSPS"

        cds = CDSFeature(location, locus_tag="test", translation=protein)
        sub_location = cds.get_sub_location_from_protein_coordinates(1, 3)
        assert isinstance(sub_location, CompoundLocation)
        assert len(sub_location.parts) == 2
        assert sub_location.start == 6
        assert sub_location.end == 11

    def test_complicated(self):
        """ Checks the protein to nucleotide mapping on a forward-strand CDS
            made of ten parts

            Protein coordinates 353 to 412 are requested, a range that starts
            in the third part and ends in the fourth. The nucleotides extracted
            with the resulting location must translate to the same slice of the
            expected translation.
        """
        # the ten parts of the CDS, all on the forward strand
        parts = [
            FeatureLocation(121124, 122061, 1),
            FeatureLocation(122339, 122383, 1),
            FeatureLocation(122559, 122666, 1),
            FeatureLocation(122712, 122874, 1),
            FeatureLocation(123060, 123337, 1),
            FeatureLocation(123481, 123749, 1),
            FeatureLocation(123809, 124032, 1),
            FeatureLocation(124091, 124193, 1),
            FeatureLocation(124236, 124401, 1),
            FeatureLocation(124684, 124724, 1)
        ]
        location = CompoundLocation(parts, operator="join")
        # the translation given here is a placeholder, the expected translation
        # is only used for the final comparison
        cds = CDSFeature(location, locus_tag="complicated", translation="A")
        # nucleotides of the region, beginning at location.start (hence the
        # padding applied before extraction at the end of the test)
        seq = (
            "ATGAGCCCTCGTCTAGACTACAATGAAGGATACGATTCCGAAGACGAGGAGATCCCCCGTTACGTACACCAT"
            "TCTAGAGGAAAGAGTCATAGATCCGTGAGGACGTCAGGTCGCTCACGCACGTTGGATTACGACGGGGATGAT"
            "GAAGCTAGTGACCACGCTGCCCCCTCCGGGATTGATCGGGACGCTCGAGCCTGTCCAACATCTCGCAGATAT"
            "ACTGATGACTGCCTTGAGACACATAAATTTCGAGGTGCCCGCTCCTCTCGCTCCCGTGGACGAACCGATGAT"
            "AACAAGGTTTTGTACTACACCAAGTATCGCAGCCCGGCTAAGGACTTGCCTATCGAGCGTGATCCCGAGGGT"
            "ATTAATTTATTCAAGGTCCGACAGCACACACGGCCAAGTGACGCTCATGTGCCCAGTGGATACCGTGAGCCC"
            "TACGAAGTCAAGGTCGACGAGTATGAGGATGATCATCCCCGTACATGCACTAGCCGCCGTGACTCTAGACAG"
            "CCGAAAGTCTACAAGGTCCGGGTTGATGAGTACGAGGATAACCTCCCTGCACGCTCTCACACTGACTTTCGC"
            "GAGTCTCCACGGTCTGAAAGATGCTCTAGCCGCTACACCGAGGACTCGAAGCCTGGGGAGCTTCCTCCCCGC"
            "TCAGGGCCCTGTCGGTCCAGCAGGCCTTCTCCGGTCGATGAGGACGTCGAGTATGAGATCCGTGAGCCCCGA"
            "GGGCATCGCTCCAGTCGACACTCTACAGATGTTGACTTTCAGCCAGTAGAACAACATCCTCGCTTTGGACAA"
            "CGTGGACTCAGCAGACCTTCGCGGGTTGATGAGGAAGTCGATTATGAGATCCGTGAGCCCCGTGGCAATCGT"
            "GTCAGTCACGCTGCTCATGGTGACAGCCCCTGTCAGGACCAAAGCTCCAGGCATATCGGCATTCAATTGTGG"
            "AGTACGCGCGGACCCCGGGCGGCTGGCCGTGGCCGGGGTCCTGATGAGTCTGACGATGTTGAGCCCTAGGCA"
            "GGGAATTGCCGTAATGCTCTTCAAACTGTATAGCAAGCTCAGCATCAATTCTTTAACTGGCAGGCGCTCTGC"
            "TCGCGCGTTTCTCTCTTGGGGTGGTTGGTTTGACTGTAGATTTCCTCTTTCAAGGCTTCTAGATACACCTTT"
            "GGAAGATAGCAACGCTATGCAAGATATTTTTGATAATTCAAATCCTTTTTACACATGGAATAGCTGGTGTTC"
            "CTGTTTTATCTAGGCAATTGACCCACGCCATCTCGGTAGGTACGGTAAAAGCAAGCCGTAATCTCGTATGGC"
            "TTCATCCTTAGCATCGTATAGATCTCCACTCGGGACTCGGCCAGGGATCTTCCATCAATCAACGTGAAGAAG"
            "TCCAGCACCCCGCTGAATCATAATATCCTACCGATTCTGCTCTCTTCACCTCTAGATACCCCTCTAGACTCC"
            "TGTCAACATGTTCCGTACAGTCGAAGACCGCCCGACCCCAAAAGAGGTATATAACTGGCGGCTGTACACCGA"
            "GGCCACCATCATTGCCACTGGTACACTCTTGTGAGTAGGTGCTGTTGTAACGAAAAACATCCAACTGATCCG"
            "CCAGGTTCGGCTATGACTCGGCTTTTGTGGGAACTACCATTGCCCGCCAAAGCTTCGTTGATGCCTTCAACA"
            "TCGTCGAGTCGGAGGCGGCGGATATTTCAAGCAATATCACGTCAACCTTTCAGGCCGGCGCATTTTTCGGCG"
            "CCATCTTCTGCTTCTTGCCTGAGTGAAGCCGTTAGAGACGGTCTCACTGGCTAACCGGACCAAGTGACCGAC"
            "AAAATTGGGCGTAAATGGGCCCTTCAGGCAAACACACTGCTGTTTCTTATTGGCGCGATTGTGATGACGGCT"
            "GCAACACATCACCTTTCCTATATATGTAAGTCATATCCCCGTAGTAGTCAAGGTTGTTAACTAGAGCAGATG"
            "CTGGACGAGCTCTCACCGGCATCGCATGCGGCGCTATCACCGCGACCGTCCCCAGCTATATTGCCGAGCTGT"
            "CAATCGTGTCGATCCGGGGCTTCCTCACCGGGTTCTTCGAAGTCGCATACCAGATTGGTAGCTTGGTTGGAT"
            "TCTGGATCAACTATGGCATTAACGAGAACATGGACAACTCCTCGGCCGCAAGCTGGAGAGTGCCTATGGCAG"
            "TCCAGATCATCCCCGCAGGAGTCCTTTTCATTGGTGGCTTTTCCTCCATGAGAGTCCTCTCTGGCTGATGCG"
            "AAAAGACAGTGAGGATGCCGCGACGGCTGCCCTGGAGGCGTTGAGGAAACTGCCACGGTCTCATCAATGTAA"
            "TCTCCCACCAAGACTCAGGACATAGTCCCATGCTGACTATTTTAGATGTCCAGGAAGACATCGAGATGAACC"
            "GCACCAGGCTGCTGGAGGAAGCTCGGATCGCCGAGAAGTACGGACAAGGTTGGTTGGCATATATCCGAGGCG"
            "CACTCTTCGAGCTCTCGCGCCATGGGATGTGGAATCGTGTTCTGCTCGTCCTCTGTGCCTTTGCACTGCAGA"
            "ATATGTCGGGAGCTGCTGCTATCAACTACTATTCCCCCATACTCTTTGCGTCGTTGGGGATCACTGATGTCG"
            "CTCTGTATACAGGTATTTATGGCCTGGTAAAAGGTAAGTTCTTCTCCTTAAGTATCTCTGGCTGACAATAGG"
            "GATTAACTGATGAGTTTACAGCCGTCGCATCAATTATATTCTACGGCATTCTCATTGATATGTGGGGCCGCC"
            "GACGTCCGACCATTGTTTCGTCACTGGCCTGCCCTCTATGTCTCTGGTTTGTGGGTGCATACGTCAAAGTTG"
            "GGCATCCAGCCGATATCATAGACGCCGGCGGGGAATTGTCCCCCTCCACGGAGGCTGGTGGTAGAGCGGCGA"
            "CTGCGATGATTATGATCTACTCCGTCTTGTAAGTGCCCCTCACTTTTGAATGGGCTTCAGCTTGGAACTCGA"
            "GTAACTGGTATCCAGTTGGTCTTTTGGTCTCAACGGTATCCCCTGGATTGTCTCCGCCGAAATCTTCCCCGG"
            "CGCGCTGCGAAATCTCACGGGGACATGGGCTGCGCTGGTGCAATGGTATGCAATTCCCTTCACCTAGTATCC"
            "ATATCTAAATCAGCAGGTTGATCCAATTCGTTATCACCAAAGCTCTCCCGTACATCTTCAATAGCCTTGGGT"
            "ACGGGACGTGGTTCTTCTTCGCCTCCTGGATGCTGCTCGCTATCATTTGGTCATTCTTTTTTCTCCCGGAAA"
            "CCAAGGGGAAGACTCTCGATGAAATGCATACGATCTTGTACGTTTCTCTCCGTCGAAATGTGGTCTTGGCTA"
            "ATGAATCAGCGGCCATTCTCTCGCCGAAGAGCAGGGTAAGGGTGAGGTTCGAGATAACACTACTAAAAGTGA"
            "TCGGGAGGCTGTCTAGTCCAGTAGTTCTAGAGGACTATTGGCTGGATGATTCCTCTGATGATTTTTGATTGG"
            "TGGTGAAAATGTTGGATGTTTAATGCCAATGTACTGGGAGAGAACATGCCGATAGTACATACCGCTGTGTTG"
            "TATATCGAAGACGGTTGATTTATATATCTTAGTCTTTCAAAAGACGGCACTCACACAATCACACTTCGATGA"
        )
        # the expected translation of the whole CDS
        translation = (
            "MSPRLDYNEGYDSEDEEIPRYVHHSRGKSHRSVRTSGRSRTLDYDGDDEASDHAAPSGIDRDAR"
            "ACPTSRRYTDDCLETHKFRGARSSRSRGRTDDNKVLYYTKYRSPAKDLPIERDPEGINLFKVRQ"
            "HTRPSDAHVPSGYREPYEVKVDEYEDDHPRTCTSRRDSRQPKVYKVRVDEYEDNLPARSHTDFR"
            "ESPRSERCSSRYTEDSKPGELPPRSGPCRSSRPSPVDEDVEYEIREPRGHRSSRHSTDVDFQPV"
            "EQHPRFGQRGLSRPSRVDEEVDYEIREPRGNRVSHAAHGDSPCQDQSSRHIGIQLWTGVPVLSR"
            "QLTHAISTPVNMFRTVEDRPTPKEVYNWRLYTEATIIATGTLLFGYDSAFVGTTIARQSFVDAF"
            "NIVESEAADISSNITSTFQAGAFFGAIFCFLPEADAGRALTGIACGAITATVPSYIAELSIVSI"
            "RGFLTGFFEVAYQIGSLVGFWINYGINENMDNSSAASWRVPMAVQIIPAGVLFIGGFSSMREDI"
            "EMNRTRLLEEARIAEKYGQGWLAYIRGALFELSRHGMWNRVLLVLCAFALQNMSGAAAINYYSP"
            "ILFASLGITDVALYTGIYGLVKAVASIIFYGILIDMWGRRRPTIVSSLACPLCLWFVGAYVKVG"
            "HPADIIDAGGELSPSTEAGGRAATAMIMIYSVFWSFGLNGIPWIVSAEIFPGALRNLTGTWAAL"
            "VQWLIQFVITKALPYIFNSLGYGTWFFFASWMLLAIIWSFFFLPETKGKTLDEMHTIFLSKDGT"
            "HTITLR")
        new = cds.get_sub_location_from_protein_coordinates(353, 412)
        # pad the beginning so that indices into the sequence line up with the
        # absolute coordinates used by the location
        assert new.extract(Seq("x" * location.start +
                               seq)).translate() == translation[353:412]

    def test_extends_past_after(self):
        """ With the last part ending in an AfterPosition, a protein range of
            7 residues must still give a location ending at 27
        """
        open_ended = FeatureLocation(21, AfterPosition(29), strand=1)
        self.sub_locations[-1] = open_ended
        self.cds.location = CompoundLocation(self.sub_locations)

        sub_location = self.cds.get_sub_location_from_protein_coordinates(0, 7)
        assert sub_location.end == 27

    def test_extends_past_before(self):
        """ On the reverse strand, with the first part starting at a
            BeforePosition, a protein range of 7 residues must give a
            location starting at 3
        """
        self.reverse_strand()
        first_end = self.sub_locations[0].end
        self.sub_locations[0] = FeatureLocation(BeforePosition(2), first_end, strand=-1)
        self.cds.location = CompoundLocation(self.sub_locations[::-1])

        sub_location = self.cds.get_sub_location_from_protein_coordinates(0, 7)
        assert sub_location.start == 3