def test_cds_function(self):
    """ Checks how a CDS resolves its overall gene function as tools add annotations """
    cds = CDSFeature(FeatureLocation(1, 5, 1), locus_tag="foo")

    def check_both(expected):
        # the annotation tracker and the shortcut attribute on the CDS must agree
        assert cds.gene_functions.get_classification() == expected
        assert cds.gene_function == expected

    # with nothing added, the classification is OTHER
    check_both(GeneFunction.OTHER)

    # a function given as a plain string is rejected
    with self.assertRaises(AssertionError):
        cds.gene_functions.add("other", "a", "b")
    # and the tracker itself can't be replaced
    with self.assertRaises(AttributeError):
        cds.gene_functions = 0

    # a single annotation decides the classification
    cds.gene_functions.add(GeneFunction.ADDITIONAL, "first_tool", "dummy")
    check_both(GeneFunction.ADDITIONAL)

    # a second tool disagreeing means there's a conflict, so back to OTHER
    cds.gene_functions.add(GeneFunction.TRANSPORT, "other_tool", "dummy")
    check_both(GeneFunction.OTHER)

    # smcogs takes priority over that conflict
    cds.gene_functions.add(GeneFunction.REGULATORY, "smcogs", "dummy")
    assert cds.gene_functions.get_classification() == GeneFunction.REGULATORY

    # and the cluster definition takes priority over smcogs
    cds.gene_functions.add(GeneFunction.CORE, "cluster_definition", "dummy")
    assert cds.gene_functions.get_classification() == GeneFunction.CORE

    # the overridden annotations must still be retrievable
    from_smcogs = cds.gene_functions.get_by_tool("smcogs")
    assert len(from_smcogs) == 1
    assert from_smcogs[0].function == GeneFunction.REGULATORY

    additional = cds.gene_functions.get_by_function(GeneFunction.ADDITIONAL)
    assert len(additional) == 1
    assert additional[0].tool == "first_tool"
def setUp(self):
    """ Builds a forward-strand CDS made of three parts, along with the
        sequences and translation the tests compare against
    """
    # the coding sequence on its own, and the protein it translates to
    self.magic = Seq("ATGGCAGGTATTTGT")
    self.translation = "MAGIC"
    # the same coding sequence with 'x' filler between the three parts
    self.magic_split = Seq("ATGGCAxxxxxxGGTxxxxxxATTTGT")

    part_coordinates = ((0, 6), (12, 15), (21, 27))
    self.sub_locations = [FeatureLocation(start, end, strand=1)
                          for start, end in part_coordinates]
    self.location = CompoundLocation(self.sub_locations)
    self.cds = CDSFeature(self.location, locus_tag="compound")
def test_simple_location_forward_partial(self):
    """ Partial protein ranges of a single-part forward CDS map to the matching codons """
    cds = CDSFeature(FeatureLocation(0, 15, 1), locus_tag="simple")
    protein_ranges = [(1, 5), (0, 3), (2, 3), (1, 4)]
    for prot_start, prot_end in protein_ranges:
        print("testing", prot_start, prot_end)
        sub_location = cds.get_sub_location_from_protein_coordinates(prot_start, prot_end)
        print(sub_location)
        nucleotides = sub_location.extract(self.magic)
        # each amino acid covers three nucleotides
        expected_nucleotides = self.magic[prot_start * 3: prot_end * 3]
        assert nucleotides == expected_nucleotides
        assert nucleotides.translate() == self.translation[prot_start:prot_end]
def test_compound_location_reverse_full(self):
    """ The full protein range of a reverse-strand compound CDS covers every part,
        whichever order the parts were supplied in
    """
    self.reverse_strand()
    for direction in [1, -1]:
        print("sub locations sorted in direction:", direction)
        ordered_parts = self.sub_locations[::direction]
        cds = CDSFeature(CompoundLocation(ordered_parts), locus_tag="compound")
        sub_location = cds.get_sub_location_from_protein_coordinates(0, 5)
        assert isinstance(sub_location, CompoundLocation)
        assert len(sub_location.parts) == 3
        print([str(part) for part in cds.location.parts])
        print([str(part) for part in sub_location.parts])
        assert len(sub_location) == len(cds.location)
        translated = sub_location.extract(self.magic_split).translate()
        assert translated == self.translation[0:5]
def setUp(self):
    """ Configures clusterfinder and builds a 2000 base record containing
        a CDS and a PFAM domain for each of the fixture entries below
    """
    options = [
        "--cf-create-clusters",
        "--cf-mean-threshold", "0.6",
        "--cf-min-cds", "5",
        "--cf-min-pfams", "5",
    ]
    self.config = build_config(options, modules=[clusterfinder], isolated=True)
    update_config({"enabled_cluster_types": []})

    self.record = DummyRecord(seq=Seq("A" * 2000))
    # each entry is (start, end, probability, PFAM identifier)
    fixtures = [
        (10, 20, 0.1, 'FAKE007'),
        (30, 40, 0.3, 'PF00106'),
        (50, 60, 0.4, 'PF00107'),
        (60, 70, 0.7, 'PF00109'),
        (70, 80, 0.98, 'PF08484'),
        (90, 100, 0.8, 'PF02401'),
        (100, 110, 0.32, 'PF04369'),
        (110, 120, 1.0, 'PF00128'),
        (130, 140, 0.2, 'FAKE234'),
        (500, 505, None, 'FAKE505'),
        (1010, 1020, 0.1, 'FAKE007'),
        (1030, 1040, 0.3, 'PF00106'),
        (1050, 1060, 0.4, 'PF00107'),
        (1060, 1070, 0.7, 'PF00109'),
        (1070, 1080, 0.98, 'PF08484'),
        (1090, 1100, 0.8, 'PF02401'),
        (1100, 1110, 0.32, 'PF04369'),
        (1110, 1120, 1.0, 'PF00128'),
    ]
    for start, end, probability, pfam_id in fixtures:
        # the CDS and its PFAM domain share the one location
        location = FeatureLocation(start, end)
        cds = CDSFeature(location, locus_tag=str(start))
        self.record.add_cds_feature(cds)

        pfam = PFAMDomain(location, "dummy_description")
        pfam.db_xref.append(pfam_id)
        pfam.probability = probability
        self.record.add_pfam_domain(pfam)
def test_compound_location_reverse_multiple(self):
    """ A protein range crossing a part boundary on the reverse strand gives a
        two part location, whichever order the parts were supplied in
    """
    self.reverse_strand()
    for direction in [1, -1]:
        print("sub locations sorted in direction:", direction)
        ordered_parts = self.sub_locations[::direction]
        cds = CDSFeature(CompoundLocation(ordered_parts), locus_tag="compound")
        sub_location = cds.get_sub_location_from_protein_coordinates(2, 4)
        assert isinstance(sub_location, CompoundLocation)
        print([str(part) for part in cds.location.parts])
        print([str(part) for part in sub_location.parts])
        assert len(sub_location.parts) == 2
        assert len(sub_location) == 6
        first, second = sub_location.parts
        # the whole middle part comes first
        assert first.start == 12
        assert first.end == 15
        # followed by the last codon's worth of the lowest-coordinate part
        assert second.start == 3
        assert second.end == 6
        translated = sub_location.extract(self.magic_split).translate()
        assert translated == self.translation[2:4]
def test_classification_with_colon(self):
    """ SMCOG descriptions containing a colon survive being added to a record """
    # the SMCOG id and description are kept in one string with : as the separator,
    # so a description with its own : must not be cut short
    cds = CDSFeature(FeatureLocation(0, 100), locus_tag="test", translation="AAA")
    record = helpers.DummyRecord(features=[cds], seq="A" * 100)
    record.add_cluster(helpers.DummyCluster(0, 100))

    results = SMCOGResults(record.id)
    hit = HMMResult("SMCOG1212:sodium:dicarboxylate_symporter", 0, 100, 2.3e-126, 416)
    results.best_hits[cds.get_name()] = hit
    results.add_to_record(record)

    smcog_functions = cds.gene_functions.get_by_tool("smcogs")
    assert len(smcog_functions) == 1
    expected_prefix = ("transport (smcogs) SMCOG1212:sodium:dicarboxylate_symporter"
                       " (Score: 416; E-value: 2.3e-126)")
    assert str(smcog_functions[0]).startswith(expected_prefix)
def test_compound_location_reverse_single(self):
    """ Protein ranges that fit inside one part of a reverse-strand compound CDS
        give a simple location, whichever order the parts were supplied in
    """
    # each case is (protein start, protein end, expected length, expected start, expected end)
    cases = [
        (0, 2, 6, 21, 27),
        (2, 3, 3, 12, 15),
    ]
    self.reverse_strand()
    for direction in [1, -1]:
        print("sub locations sorted in direction:", direction)
        ordered_parts = self.sub_locations[::direction]
        cds = CDSFeature(CompoundLocation(ordered_parts), locus_tag="compound")
        for prot_start, prot_end, length, nucl_start, nucl_end in cases:
            sub_location = cds.get_sub_location_from_protein_coordinates(prot_start, prot_end)
            assert isinstance(sub_location, FeatureLocation)
            assert len(sub_location) == length
            assert sub_location.start == nucl_start
            assert sub_location.end == nucl_end
            translated = sub_location.extract(self.magic_split).translate()
            assert translated == self.translation[prot_start:prot_end]
def test_cds_function_conversion(self):
    """ The gene function of a CDS survives conversion to biopython and back """
    cds = CDSFeature(FeatureLocation(1, 5, 1), locus_tag="foo")

    # first with the default function
    assert cds.gene_function == GeneFunction.OTHER
    bio_feature = cds.to_biopython()[0]
    regenerated = CDSFeature.from_biopython(bio_feature)
    assert regenerated.gene_function == GeneFunction.OTHER

    # then with an explicit annotation
    cds.gene_functions.add(GeneFunction.ADDITIONAL, "tool", "desc")
    assert cds.gene_function == GeneFunction.ADDITIONAL
    bio_feature = cds.to_biopython()[0]
    regenerated = CDSFeature.from_biopython(bio_feature)
    assert regenerated.gene_function == GeneFunction.ADDITIONAL
def test_membership(self):
    """ Attributes a feature type doesn't define can't be assigned to it """
    location = FeatureLocation(0, 1, strand=1)

    # a plain Feature has no locus tag
    with self.assertRaises(AttributeError):
        feature = Feature(location, feature_type="none")
        feature.locus_tag = "something"

    # a CDSFeature has no 'other_value'
    with self.assertRaises(AttributeError):
        cds = CDSFeature(location, translation="none", gene="a")
        cds.other_value = 1

    cluster = Cluster(location, 0, 0, products=["a", "b"])
    assert cluster.products == ("a", "b")
    # a Cluster has products, but no singular product
    with self.assertRaises(AttributeError):
        cluster.product = ["c", "d"]
def generate_motif_features(record: Record, feature: CDSFeature,
                            motifs: List[HMMResult]) -> List[CDSMotif]:
    """ Convert a list of HMMResult to a list of CDSMotif features

        Arguments:
            record: the record whose sequence is used to build each motif's translation
            feature: the CDSFeature the motifs were found in
            motifs: the HMMResults to convert, their query_start and query_end
                    being protein coordinates within the CDSFeature

        Returns:
            a list of CDSMotif features, one for each HMMResult and in the same order
    """
    # use a locus tag if one exists, otherwise whatever name the CDS does have
    locus_tag = feature.locus_tag or feature.get_name()
    # use the translation table if it's there, otherwise table 1
    transl_table = feature.transl_table or 1

    motif_features = []
    # the index ends up in the domain id, which is user facing, so 1-indexed
    for i, motif in enumerate(motifs, 1):
        loc = feature.get_sub_location_from_protein_coordinates(
                motif.query_start, motif.query_end)
        new_motif = CDSMotif(loc)
        new_motif.label = motif.hit_id
        new_motif.motif = motif.hit_id  # TODO: why both label AND motif?
        new_motif.domain_id = 'nrpspksmotif_{}_{:04d}'.format(locus_tag, i)
        new_motif.evalue = motif.evalue
        new_motif.score = motif.bitscore
        new_motif.tool = "pksnrpsmotif"
        new_motif.detection = "hmmscan"
        new_motif.database = "abmotifs"
        new_motif.locus_tag = locus_tag

        # the translation is taken from the motif's own location within the record
        new_motif.translation = str(
                new_motif.extract(record.seq).translate(table=transl_table))
        new_motif.notes.append(
                "NRPS/PKS Motif: %s (e-value: %s, bit-score: %s)"
                % (motif.hit_id, motif.evalue, motif.bitscore))  # TODO move to CDSMotif

        motif_features.append(new_motif)
    return motif_features
def test_required_identifiers(self):
    """ A CDS needs at least one of locus tag, protein id or gene name """
    # with none of them, construction fails
    with self.assertRaises(ValueError):
        CDSFeature(FeatureLocation(1, 5, 1))
    # any single one of them is enough
    for identifier in ("locus_tag", "protein_id", "gene"):
        assert CDSFeature(FeatureLocation(1, 5, 1), **{identifier: "foo"})
def test_complicated(self):
    """ A protein range within a ten part forward CDS extracts the matching
        section of the expected translation
    """
    part_coordinates = [
        (121124, 122061),
        (122339, 122383),
        (122559, 122666),
        (122712, 122874),
        (123060, 123337),
        (123481, 123749),
        (123809, 124032),
        (124091, 124193),
        (124236, 124401),
        (124684, 124724),
    ]
    parts = [FeatureLocation(start, end, 1) for start, end in part_coordinates]
    location = CompoundLocation(parts, operator="join")
    cds = CDSFeature(location, locus_tag="complicated")
    # the nucleotides from the start of the first part onwards
    nucleotides = ("ATGAGCCCTCGTCTAGACTACAATGAAGGATACGATTCCGAAGACGAGGAGATCCCCCGTTACGTACACCAT"
                   "TCTAGAGGAAAGAGTCATAGATCCGTGAGGACGTCAGGTCGCTCACGCACGTTGGATTACGACGGGGATGAT"
                   "GAAGCTAGTGACCACGCTGCCCCCTCCGGGATTGATCGGGACGCTCGAGCCTGTCCAACATCTCGCAGATAT"
                   "ACTGATGACTGCCTTGAGACACATAAATTTCGAGGTGCCCGCTCCTCTCGCTCCCGTGGACGAACCGATGAT"
                   "AACAAGGTTTTGTACTACACCAAGTATCGCAGCCCGGCTAAGGACTTGCCTATCGAGCGTGATCCCGAGGGT"
                   "ATTAATTTATTCAAGGTCCGACAGCACACACGGCCAAGTGACGCTCATGTGCCCAGTGGATACCGTGAGCCC"
                   "TACGAAGTCAAGGTCGACGAGTATGAGGATGATCATCCCCGTACATGCACTAGCCGCCGTGACTCTAGACAG"
                   "CCGAAAGTCTACAAGGTCCGGGTTGATGAGTACGAGGATAACCTCCCTGCACGCTCTCACACTGACTTTCGC"
                   "GAGTCTCCACGGTCTGAAAGATGCTCTAGCCGCTACACCGAGGACTCGAAGCCTGGGGAGCTTCCTCCCCGC"
                   "TCAGGGCCCTGTCGGTCCAGCAGGCCTTCTCCGGTCGATGAGGACGTCGAGTATGAGATCCGTGAGCCCCGA"
                   "GGGCATCGCTCCAGTCGACACTCTACAGATGTTGACTTTCAGCCAGTAGAACAACATCCTCGCTTTGGACAA"
                   "CGTGGACTCAGCAGACCTTCGCGGGTTGATGAGGAAGTCGATTATGAGATCCGTGAGCCCCGTGGCAATCGT"
                   "GTCAGTCACGCTGCTCATGGTGACAGCCCCTGTCAGGACCAAAGCTCCAGGCATATCGGCATTCAATTGTGG"
                   "AGTACGCGCGGACCCCGGGCGGCTGGCCGTGGCCGGGGTCCTGATGAGTCTGACGATGTTGAGCCCTAGGCA"
                   "GGGAATTGCCGTAATGCTCTTCAAACTGTATAGCAAGCTCAGCATCAATTCTTTAACTGGCAGGCGCTCTGC"
                   "TCGCGCGTTTCTCTCTTGGGGTGGTTGGTTTGACTGTAGATTTCCTCTTTCAAGGCTTCTAGATACACCTTT"
                   "GGAAGATAGCAACGCTATGCAAGATATTTTTGATAATTCAAATCCTTTTTACACATGGAATAGCTGGTGTTC"
                   "CTGTTTTATCTAGGCAATTGACCCACGCCATCTCGGTAGGTACGGTAAAAGCAAGCCGTAATCTCGTATGGC"
                   "TTCATCCTTAGCATCGTATAGATCTCCACTCGGGACTCGGCCAGGGATCTTCCATCAATCAACGTGAAGAAG"
                   "TCCAGCACCCCGCTGAATCATAATATCCTACCGATTCTGCTCTCTTCACCTCTAGATACCCCTCTAGACTCC"
                   "TGTCAACATGTTCCGTACAGTCGAAGACCGCCCGACCCCAAAAGAGGTATATAACTGGCGGCTGTACACCGA"
                   "GGCCACCATCATTGCCACTGGTACACTCTTGTGAGTAGGTGCTGTTGTAACGAAAAACATCCAACTGATCCG"
                   "CCAGGTTCGGCTATGACTCGGCTTTTGTGGGAACTACCATTGCCCGCCAAAGCTTCGTTGATGCCTTCAACA"
                   "TCGTCGAGTCGGAGGCGGCGGATATTTCAAGCAATATCACGTCAACCTTTCAGGCCGGCGCATTTTTCGGCG"
                   "CCATCTTCTGCTTCTTGCCTGAGTGAAGCCGTTAGAGACGGTCTCACTGGCTAACCGGACCAAGTGACCGAC"
                   "AAAATTGGGCGTAAATGGGCCCTTCAGGCAAACACACTGCTGTTTCTTATTGGCGCGATTGTGATGACGGCT"
                   "GCAACACATCACCTTTCCTATATATGTAAGTCATATCCCCGTAGTAGTCAAGGTTGTTAACTAGAGCAGATG"
                   "CTGGACGAGCTCTCACCGGCATCGCATGCGGCGCTATCACCGCGACCGTCCCCAGCTATATTGCCGAGCTGT"
                   "CAATCGTGTCGATCCGGGGCTTCCTCACCGGGTTCTTCGAAGTCGCATACCAGATTGGTAGCTTGGTTGGAT"
                   "TCTGGATCAACTATGGCATTAACGAGAACATGGACAACTCCTCGGCCGCAAGCTGGAGAGTGCCTATGGCAG"
                   "TCCAGATCATCCCCGCAGGAGTCCTTTTCATTGGTGGCTTTTCCTCCATGAGAGTCCTCTCTGGCTGATGCG"
                   "AAAAGACAGTGAGGATGCCGCGACGGCTGCCCTGGAGGCGTTGAGGAAACTGCCACGGTCTCATCAATGTAA"
                   "TCTCCCACCAAGACTCAGGACATAGTCCCATGCTGACTATTTTAGATGTCCAGGAAGACATCGAGATGAACC"
                   "GCACCAGGCTGCTGGAGGAAGCTCGGATCGCCGAGAAGTACGGACAAGGTTGGTTGGCATATATCCGAGGCG"
                   "CACTCTTCGAGCTCTCGCGCCATGGGATGTGGAATCGTGTTCTGCTCGTCCTCTGTGCCTTTGCACTGCAGA"
                   "ATATGTCGGGAGCTGCTGCTATCAACTACTATTCCCCCATACTCTTTGCGTCGTTGGGGATCACTGATGTCG"
                   "CTCTGTATACAGGTATTTATGGCCTGGTAAAAGGTAAGTTCTTCTCCTTAAGTATCTCTGGCTGACAATAGG"
                   "GATTAACTGATGAGTTTACAGCCGTCGCATCAATTATATTCTACGGCATTCTCATTGATATGTGGGGCCGCC"
                   "GACGTCCGACCATTGTTTCGTCACTGGCCTGCCCTCTATGTCTCTGGTTTGTGGGTGCATACGTCAAAGTTG"
                   "GGCATCCAGCCGATATCATAGACGCCGGCGGGGAATTGTCCCCCTCCACGGAGGCTGGTGGTAGAGCGGCGA"
                   "CTGCGATGATTATGATCTACTCCGTCTTGTAAGTGCCCCTCACTTTTGAATGGGCTTCAGCTTGGAACTCGA"
                   "GTAACTGGTATCCAGTTGGTCTTTTGGTCTCAACGGTATCCCCTGGATTGTCTCCGCCGAAATCTTCCCCGG"
                   "CGCGCTGCGAAATCTCACGGGGACATGGGCTGCGCTGGTGCAATGGTATGCAATTCCCTTCACCTAGTATCC"
                   "ATATCTAAATCAGCAGGTTGATCCAATTCGTTATCACCAAAGCTCTCCCGTACATCTTCAATAGCCTTGGGT"
                   "ACGGGACGTGGTTCTTCTTCGCCTCCTGGATGCTGCTCGCTATCATTTGGTCATTCTTTTTTCTCCCGGAAA"
                   "CCAAGGGGAAGACTCTCGATGAAATGCATACGATCTTGTACGTTTCTCTCCGTCGAAATGTGGTCTTGGCTA"
                   "ATGAATCAGCGGCCATTCTCTCGCCGAAGAGCAGGGTAAGGGTGAGGTTCGAGATAACACTACTAAAAGTGA"
                   "TCGGGAGGCTGTCTAGTCCAGTAGTTCTAGAGGACTATTGGCTGGATGATTCCTCTGATGATTTTTGATTGG"
                   "TGGTGAAAATGTTGGATGTTTAATGCCAATGTACTGGGAGAGAACATGCCGATAGTACATACCGCTGTGTTG"
                   "TATATCGAAGACGGTTGATTTATATATCTTAGTCTTTCAAAAGACGGCACTCACACAATCACACTTCGATGA")
    translation = ("MSPRLDYNEGYDSEDEEIPRYVHHSRGKSHRSVRTSGRSRTLDYDGDDEASDHAAPSGIDRDAR"
                   "ACPTSRRYTDDCLETHKFRGARSSRSRGRTDDNKVLYYTKYRSPAKDLPIERDPEGINLFKVRQ"
                   "HTRPSDAHVPSGYREPYEVKVDEYEDDHPRTCTSRRDSRQPKVYKVRVDEYEDNLPARSHTDFR"
                   "ESPRSERCSSRYTEDSKPGELPPRSGPCRSSRPSPVDEDVEYEIREPRGHRSSRHSTDVDFQPV"
                   "EQHPRFGQRGLSRPSRVDEEVDYEIREPRGNRVSHAAHGDSPCQDQSSRHIGIQLWTGVPVLSR"
                   "QLTHAISTPVNMFRTVEDRPTPKEVYNWRLYTEATIIATGTLLFGYDSAFVGTTIARQSFVDAF"
                   "NIVESEAADISSNITSTFQAGAFFGAIFCFLPEADAGRALTGIACGAITATVPSYIAELSIVSI"
                   "RGFLTGFFEVAYQIGSLVGFWINYGINENMDNSSAASWRVPMAVQIIPAGVLFIGGFSSMREDI"
                   "EMNRTRLLEEARIAEKYGQGWLAYIRGALFELSRHGMWNRVLLVLCAFALQNMSGAAAINYYSP"
                   "ILFASLGITDVALYTGIYGLVKAVASIIFYGILIDMWGRRRPTIVSSLACPLCLWFVGAYVKVG"
                   "HPADIIDAGGELSPSTEAGGRAATAMIMIYSVFWSFGLNGIPWIVSAEIFPGALRNLTGTWAAL"
                   "VQWLIQFVITKALPYIFNSLGYGTWFFFASWMLLAIIWSFFFLPETKGKTLDEMHTIFLSKDGT"
                   "HTITLR")
    prot_start, prot_end = 353, 412
    sub_location = cds.get_sub_location_from_protein_coordinates(prot_start, prot_end)
    # pad the beginning so that positions in the sequence line up with the location
    padded = Seq("x" * location.start + nucleotides)
    assert sub_location.extract(padded).translate() == translation[prot_start:prot_end]
def generate_domain_features(record: Record, gene: CDSFeature,
                             domains: List[HMMResult]) -> Dict[HMMResult, AntismashDomain]:
    """ Builds an AntismashDomain feature for every HMMResult given

        Domains hitting AMP-binding, PKS_AT, CAL_domain and PKS_KR are labelled
        with the gene name and a per-type running number, all other domains
        only receive a numbered 'Xdom' domain id.

        Arguments:
            record: the record the new features will belong to
            gene: the CDSFeature the domains were found in
            domains: a list of HMMResults found in the CDSFeature

        Returns:
            a dictionary mapping the HMMResult used to the matching AntismashDomain
    """
    features_by_hit = {}
    # running numbers for each of the labelled domain types, and for the rest
    amp_count = 0
    at_count = 0
    cal_count = 0
    kr_count = 0
    other_count = 0

    for hit in domains:
        location = gene.get_sub_location_from_protein_coordinates(
                hit.query_start, hit.query_end)

        # the parts of the feature common to all domain types
        feature = AntismashDomain(location)
        feature.domain = hit.hit_id
        feature.locus_tag = gene.locus_tag
        feature.detection = "hmmscan"
        feature.database = "nrpspksdomains.hmm"
        feature.evalue = hit.evalue
        feature.score = hit.bitscore

        table = gene.transl_table or 1
        nucleotides = feature.extract(record.seq)
        feature.translation = str(nucleotides.translate(table=table))

        # only some domain types are given a label
        label = None
        if hit.hit_id == "AMP-binding":
            amp_count += 1
            label = "{}_A{}".format(gene.get_name(), amp_count)
        elif hit.hit_id == "PKS_AT":
            at_count += 1
            label = "{}_AT{}".format(gene.get_name(), at_count)
        elif hit.hit_id == "CAL_domain":
            cal_count += 1
            label = gene.get_name() + "_CAL" + str(cal_count)
        elif hit.hit_id == "PKS_KR":
            kr_count += 1
            label = gene.get_name() + "_KR" + str(kr_count)

        if label is None:
            other_count += 1
            # unlike the labelled types, only the gene name up to the first '.' is used
            short_name = gene.get_name().partition(".")[0]
            feature.domain_id = "nrpspksdomains_" + short_name + "_Xdom" + '{:02d}'.format(other_count)
        else:
            feature.label = label
            feature.domain_id = "nrpspksdomains_" + label

        # NOTE(review): the dictionary is keyed by HMMResult, not by feature name,
        # so this presumably never detects a duplicated name - confirm the intent
        assert feature.get_name() not in features_by_hit
        features_by_hit[hit] = feature
    return features_by_hit
def test_simple_location_forward_complete(self):
    """ The full protein range of a single-part forward CDS gives the whole sequence """
    cds = CDSFeature(FeatureLocation(0, 15, 1), locus_tag="simple")
    whole = cds.get_sub_location_from_protein_coordinates(0, 5)
    nucleotides = whole.extract(self.magic)
    assert nucleotides == self.magic
    assert nucleotides.translate() == self.translation
def reverse_strand(self):
    """ Switches the fixtures over to the reverse strand

        Both sequences are reverse complemented and the locations are rebuilt
        with the same coordinates but a strand of -1.
    """
    self.magic = self.magic.reverse_complement()
    self.magic_split = self.magic_split.reverse_complement()

    flipped = []
    for part in self.sub_locations:
        flipped.append(FeatureLocation(part.start, part.end, strand=-1))
    self.sub_locations = flipped

    self.location = CompoundLocation(self.sub_locations)
    self.cds = CDSFeature(self.location, locus_tag="compound")
def create_cds(self, start, end, strand=1):
    """ Builds a CDSFeature for the given coordinates and strand,
        with a locus tag in the form 'start-end'
    """
    location = FeatureLocation(start, end, strand)
    name = "%d-%d" % (start, end)
    return CDSFeature(location, locus_tag=name)
class TestCDSProteinLocation(unittest.TestCase):
    """ Tests the conversion of protein coordinates within a CDS into
        nucleotide locations, for simple and compound locations on both strands
    """
    def setUp(self):
        """ Builds a forward-strand CDS made of three parts, along with the
            sequences and translation the tests compare against
        """
        # the coding sequence on its own, and the protein it translates to
        self.magic = Seq("ATGGCAGGTATTTGT")
        self.translation = "MAGIC"
        # the same coding sequence with 'x' filler between the three parts
        self.magic_split = Seq("ATGGCAxxxxxxGGTxxxxxxATTTGT")

        part_coordinates = ((0, 6), (12, 15), (21, 27))
        self.sub_locations = [FeatureLocation(start, end, strand=1)
                              for start, end in part_coordinates]
        self.location = CompoundLocation(self.sub_locations)
        self.cds = CDSFeature(self.location, locus_tag="compound")

    def reverse_strand(self):
        """ Switches the fixtures over to the reverse strand

            Both sequences are reverse complemented and the locations are rebuilt
            with the same coordinates but a strand of -1.
        """
        self.magic = self.magic.reverse_complement()
        self.magic_split = self.magic_split.reverse_complement()

        flipped = []
        for part in self.sub_locations:
            flipped.append(FeatureLocation(part.start, part.end, strand=-1))
        self.sub_locations = flipped

        self.location = CompoundLocation(self.sub_locations)
        self.cds = CDSFeature(self.location, locus_tag="compound")

    def test_simple_location_forward_complete(self):
        """ The full protein range of a single-part forward CDS gives the whole sequence """
        cds = CDSFeature(FeatureLocation(0, 15, 1), locus_tag="simple")
        whole = cds.get_sub_location_from_protein_coordinates(0, 5)
        nucleotides = whole.extract(self.magic)
        assert nucleotides == self.magic
        assert nucleotides.translate() == self.translation

    def test_simple_location_forward_partial(self):
        """ Partial protein ranges of a single-part forward CDS map to the matching codons """
        cds = CDSFeature(FeatureLocation(0, 15, 1), locus_tag="simple")
        protein_ranges = [(1, 5), (0, 3), (2, 3), (1, 4)]
        for prot_start, prot_end in protein_ranges:
            print("testing", prot_start, prot_end)
            sub_location = cds.get_sub_location_from_protein_coordinates(prot_start, prot_end)
            print(sub_location)
            nucleotides = sub_location.extract(self.magic)
            # each amino acid covers three nucleotides
            expected_nucleotides = self.magic[prot_start * 3: prot_end * 3]
            assert nucleotides == expected_nucleotides
            assert nucleotides.translate() == self.translation[prot_start:prot_end]

    def test_compound_location_forward_full(self):
        """ The full protein range of the compound CDS gives back its whole location """
        sub_location = self.cds.get_sub_location_from_protein_coordinates(0, 5)
        assert isinstance(sub_location, CompoundLocation)
        assert len(sub_location.parts) == 3
        print([str(part) for part in self.cds.location.parts])
        print([str(part) for part in sub_location.parts])
        assert len(sub_location) == len(self.cds.location)
        assert sub_location == self.location, "%s != %s" % (str(sub_location), str(self.location))
        nucleotides = sub_location.extract(self.magic_split)
        assert nucleotides == self.magic
        assert nucleotides.translate() == self.translation[0:5]

    def test_compound_forward_within_single(self):
        """ Protein ranges that fit inside one part of the compound CDS give a simple location """
        # each case is (protein start, protein end, expected length, expected start, expected end)
        cases = [
            (0, 2, 6, 0, 6),
            (2, 3, 3, 12, 15),
        ]
        for prot_start, prot_end, length, nucl_start, nucl_end in cases:
            sub_location = self.cds.get_sub_location_from_protein_coordinates(prot_start, prot_end)
            assert isinstance(sub_location, FeatureLocation)
            assert len(sub_location) == length
            assert sub_location.start == nucl_start
            assert sub_location.end == nucl_end
            translated = sub_location.extract(self.magic_split).translate()
            assert translated == self.translation[prot_start:prot_end]

    def test_compound_forward_over_multiple(self):
        """ A protein range crossing a part boundary gives a two part location """
        sub_location = self.cds.get_sub_location_from_protein_coordinates(2, 4)
        assert isinstance(sub_location, CompoundLocation)
        print([str(part) for part in self.cds.location.parts])
        print([str(part) for part in sub_location.parts])
        assert len(sub_location.parts) == 2
        assert len(sub_location) == 6
        first, second = sub_location.parts
        # the whole middle part comes first
        assert first.start == 12
        assert first.end == 15
        # followed by the first codon of the last part
        assert second.start == 21
        assert second.end == 24
        translated = sub_location.extract(self.magic_split).translate()
        assert translated == self.translation[2:4]

    def test_compound_location_reverse_full(self):
        """ The full protein range of a reverse-strand compound CDS covers every part,
            whichever order the parts were supplied in
        """
        self.reverse_strand()
        for direction in [1, -1]:
            print("sub locations sorted in direction:", direction)
            ordered_parts = self.sub_locations[::direction]
            cds = CDSFeature(CompoundLocation(ordered_parts), locus_tag="compound")
            sub_location = cds.get_sub_location_from_protein_coordinates(0, 5)
            assert isinstance(sub_location, CompoundLocation)
            assert len(sub_location.parts) == 3
            print([str(part) for part in cds.location.parts])
            print([str(part) for part in sub_location.parts])
            assert len(sub_location) == len(cds.location)
            translated = sub_location.extract(self.magic_split).translate()
            assert translated == self.translation[0:5]

    def test_compound_location_reverse_single(self):
        """ Protein ranges that fit inside one part of a reverse-strand compound CDS
            give a simple location, whichever order the parts were supplied in
        """
        # each case is (protein start, protein end, expected length, expected start, expected end)
        cases = [
            (0, 2, 6, 21, 27),
            (2, 3, 3, 12, 15),
        ]
        self.reverse_strand()
        for direction in [1, -1]:
            print("sub locations sorted in direction:", direction)
            ordered_parts = self.sub_locations[::direction]
            cds = CDSFeature(CompoundLocation(ordered_parts), locus_tag="compound")
            for prot_start, prot_end, length, nucl_start, nucl_end in cases:
                sub_location = cds.get_sub_location_from_protein_coordinates(prot_start, prot_end)
                assert isinstance(sub_location, FeatureLocation)
                assert len(sub_location) == length
                assert sub_location.start == nucl_start
                assert sub_location.end == nucl_end
                translated = sub_location.extract(self.magic_split).translate()
                assert translated == self.translation[prot_start:prot_end]

    def test_compound_location_reverse_multiple(self):
        """ A protein range crossing a part boundary on the reverse strand gives a
            two part location, whichever order the parts were supplied in
        """
        self.reverse_strand()
        for direction in [1, -1]:
            print("sub locations sorted in direction:", direction)
            ordered_parts = self.sub_locations[::direction]
            cds = CDSFeature(CompoundLocation(ordered_parts), locus_tag="compound")
            sub_location = cds.get_sub_location_from_protein_coordinates(2, 4)
            assert isinstance(sub_location, CompoundLocation)
            print([str(part) for part in cds.location.parts])
            print([str(part) for part in sub_location.parts])
            assert len(sub_location.parts) == 2
            assert len(sub_location) == 6
            first, second = sub_location.parts
            # the whole middle part comes first
            assert first.start == 12
            assert first.end == 15
            # followed by the last codon's worth of the lowest-coordinate part
            assert second.start == 3
            assert second.end == 6
            translated = sub_location.extract(self.magic_split).translate()
            assert translated == self.translation[2:4]

    def test_complicated(self):
        """ A protein range within a ten part forward CDS extracts the matching
            section of the expected translation
        """
        part_coordinates = [
            (121124, 122061),
            (122339, 122383),
            (122559, 122666),
            (122712, 122874),
            (123060, 123337),
            (123481, 123749),
            (123809, 124032),
            (124091, 124193),
            (124236, 124401),
            (124684, 124724),
        ]
        parts = [FeatureLocation(start, end, 1) for start, end in part_coordinates]
        location = CompoundLocation(parts, operator="join")
        cds = CDSFeature(location, locus_tag="complicated")
        # the nucleotides from the start of the first part onwards
        nucleotides = ("ATGAGCCCTCGTCTAGACTACAATGAAGGATACGATTCCGAAGACGAGGAGATCCCCCGTTACGTACACCAT"
                       "TCTAGAGGAAAGAGTCATAGATCCGTGAGGACGTCAGGTCGCTCACGCACGTTGGATTACGACGGGGATGAT"
                       "GAAGCTAGTGACCACGCTGCCCCCTCCGGGATTGATCGGGACGCTCGAGCCTGTCCAACATCTCGCAGATAT"
                       "ACTGATGACTGCCTTGAGACACATAAATTTCGAGGTGCCCGCTCCTCTCGCTCCCGTGGACGAACCGATGAT"
                       "AACAAGGTTTTGTACTACACCAAGTATCGCAGCCCGGCTAAGGACTTGCCTATCGAGCGTGATCCCGAGGGT"
                       "ATTAATTTATTCAAGGTCCGACAGCACACACGGCCAAGTGACGCTCATGTGCCCAGTGGATACCGTGAGCCC"
                       "TACGAAGTCAAGGTCGACGAGTATGAGGATGATCATCCCCGTACATGCACTAGCCGCCGTGACTCTAGACAG"
                       "CCGAAAGTCTACAAGGTCCGGGTTGATGAGTACGAGGATAACCTCCCTGCACGCTCTCACACTGACTTTCGC"
                       "GAGTCTCCACGGTCTGAAAGATGCTCTAGCCGCTACACCGAGGACTCGAAGCCTGGGGAGCTTCCTCCCCGC"
                       "TCAGGGCCCTGTCGGTCCAGCAGGCCTTCTCCGGTCGATGAGGACGTCGAGTATGAGATCCGTGAGCCCCGA"
                       "GGGCATCGCTCCAGTCGACACTCTACAGATGTTGACTTTCAGCCAGTAGAACAACATCCTCGCTTTGGACAA"
                       "CGTGGACTCAGCAGACCTTCGCGGGTTGATGAGGAAGTCGATTATGAGATCCGTGAGCCCCGTGGCAATCGT"
                       "GTCAGTCACGCTGCTCATGGTGACAGCCCCTGTCAGGACCAAAGCTCCAGGCATATCGGCATTCAATTGTGG"
                       "AGTACGCGCGGACCCCGGGCGGCTGGCCGTGGCCGGGGTCCTGATGAGTCTGACGATGTTGAGCCCTAGGCA"
                       "GGGAATTGCCGTAATGCTCTTCAAACTGTATAGCAAGCTCAGCATCAATTCTTTAACTGGCAGGCGCTCTGC"
                       "TCGCGCGTTTCTCTCTTGGGGTGGTTGGTTTGACTGTAGATTTCCTCTTTCAAGGCTTCTAGATACACCTTT"
                       "GGAAGATAGCAACGCTATGCAAGATATTTTTGATAATTCAAATCCTTTTTACACATGGAATAGCTGGTGTTC"
                       "CTGTTTTATCTAGGCAATTGACCCACGCCATCTCGGTAGGTACGGTAAAAGCAAGCCGTAATCTCGTATGGC"
                       "TTCATCCTTAGCATCGTATAGATCTCCACTCGGGACTCGGCCAGGGATCTTCCATCAATCAACGTGAAGAAG"
                       "TCCAGCACCCCGCTGAATCATAATATCCTACCGATTCTGCTCTCTTCACCTCTAGATACCCCTCTAGACTCC"
                       "TGTCAACATGTTCCGTACAGTCGAAGACCGCCCGACCCCAAAAGAGGTATATAACTGGCGGCTGTACACCGA"
                       "GGCCACCATCATTGCCACTGGTACACTCTTGTGAGTAGGTGCTGTTGTAACGAAAAACATCCAACTGATCCG"
                       "CCAGGTTCGGCTATGACTCGGCTTTTGTGGGAACTACCATTGCCCGCCAAAGCTTCGTTGATGCCTTCAACA"
                       "TCGTCGAGTCGGAGGCGGCGGATATTTCAAGCAATATCACGTCAACCTTTCAGGCCGGCGCATTTTTCGGCG"
                       "CCATCTTCTGCTTCTTGCCTGAGTGAAGCCGTTAGAGACGGTCTCACTGGCTAACCGGACCAAGTGACCGAC"
                       "AAAATTGGGCGTAAATGGGCCCTTCAGGCAAACACACTGCTGTTTCTTATTGGCGCGATTGTGATGACGGCT"
                       "GCAACACATCACCTTTCCTATATATGTAAGTCATATCCCCGTAGTAGTCAAGGTTGTTAACTAGAGCAGATG"
                       "CTGGACGAGCTCTCACCGGCATCGCATGCGGCGCTATCACCGCGACCGTCCCCAGCTATATTGCCGAGCTGT"
                       "CAATCGTGTCGATCCGGGGCTTCCTCACCGGGTTCTTCGAAGTCGCATACCAGATTGGTAGCTTGGTTGGAT"
                       "TCTGGATCAACTATGGCATTAACGAGAACATGGACAACTCCTCGGCCGCAAGCTGGAGAGTGCCTATGGCAG"
                       "TCCAGATCATCCCCGCAGGAGTCCTTTTCATTGGTGGCTTTTCCTCCATGAGAGTCCTCTCTGGCTGATGCG"
                       "AAAAGACAGTGAGGATGCCGCGACGGCTGCCCTGGAGGCGTTGAGGAAACTGCCACGGTCTCATCAATGTAA"
                       "TCTCCCACCAAGACTCAGGACATAGTCCCATGCTGACTATTTTAGATGTCCAGGAAGACATCGAGATGAACC"
                       "GCACCAGGCTGCTGGAGGAAGCTCGGATCGCCGAGAAGTACGGACAAGGTTGGTTGGCATATATCCGAGGCG"
                       "CACTCTTCGAGCTCTCGCGCCATGGGATGTGGAATCGTGTTCTGCTCGTCCTCTGTGCCTTTGCACTGCAGA"
                       "ATATGTCGGGAGCTGCTGCTATCAACTACTATTCCCCCATACTCTTTGCGTCGTTGGGGATCACTGATGTCG"
                       "CTCTGTATACAGGTATTTATGGCCTGGTAAAAGGTAAGTTCTTCTCCTTAAGTATCTCTGGCTGACAATAGG"
                       "GATTAACTGATGAGTTTACAGCCGTCGCATCAATTATATTCTACGGCATTCTCATTGATATGTGGGGCCGCC"
                       "GACGTCCGACCATTGTTTCGTCACTGGCCTGCCCTCTATGTCTCTGGTTTGTGGGTGCATACGTCAAAGTTG"
                       "GGCATCCAGCCGATATCATAGACGCCGGCGGGGAATTGTCCCCCTCCACGGAGGCTGGTGGTAGAGCGGCGA"
                       "CTGCGATGATTATGATCTACTCCGTCTTGTAAGTGCCCCTCACTTTTGAATGGGCTTCAGCTTGGAACTCGA"
                       "GTAACTGGTATCCAGTTGGTCTTTTGGTCTCAACGGTATCCCCTGGATTGTCTCCGCCGAAATCTTCCCCGG"
                       "CGCGCTGCGAAATCTCACGGGGACATGGGCTGCGCTGGTGCAATGGTATGCAATTCCCTTCACCTAGTATCC"
                       "ATATCTAAATCAGCAGGTTGATCCAATTCGTTATCACCAAAGCTCTCCCGTACATCTTCAATAGCCTTGGGT"
                       "ACGGGACGTGGTTCTTCTTCGCCTCCTGGATGCTGCTCGCTATCATTTGGTCATTCTTTTTTCTCCCGGAAA"
                       "CCAAGGGGAAGACTCTCGATGAAATGCATACGATCTTGTACGTTTCTCTCCGTCGAAATGTGGTCTTGGCTA"
                       "ATGAATCAGCGGCCATTCTCTCGCCGAAGAGCAGGGTAAGGGTGAGGTTCGAGATAACACTACTAAAAGTGA"
                       "TCGGGAGGCTGTCTAGTCCAGTAGTTCTAGAGGACTATTGGCTGGATGATTCCTCTGATGATTTTTGATTGG"
                       "TGGTGAAAATGTTGGATGTTTAATGCCAATGTACTGGGAGAGAACATGCCGATAGTACATACCGCTGTGTTG"
                       "TATATCGAAGACGGTTGATTTATATATCTTAGTCTTTCAAAAGACGGCACTCACACAATCACACTTCGATGA")
        translation = ("MSPRLDYNEGYDSEDEEIPRYVHHSRGKSHRSVRTSGRSRTLDYDGDDEASDHAAPSGIDRDAR"
                       "ACPTSRRYTDDCLETHKFRGARSSRSRGRTDDNKVLYYTKYRSPAKDLPIERDPEGINLFKVRQ"
                       "HTRPSDAHVPSGYREPYEVKVDEYEDDHPRTCTSRRDSRQPKVYKVRVDEYEDNLPARSHTDFR"
                       "ESPRSERCSSRYTEDSKPGELPPRSGPCRSSRPSPVDEDVEYEIREPRGHRSSRHSTDVDFQPV"
                       "EQHPRFGQRGLSRPSRVDEEVDYEIREPRGNRVSHAAHGDSPCQDQSSRHIGIQLWTGVPVLSR"
                       "QLTHAISTPVNMFRTVEDRPTPKEVYNWRLYTEATIIATGTLLFGYDSAFVGTTIARQSFVDAF"
                       "NIVESEAADISSNITSTFQAGAFFGAIFCFLPEADAGRALTGIACGAITATVPSYIAELSIVSI"
                       "RGFLTGFFEVAYQIGSLVGFWINYGINENMDNSSAASWRVPMAVQIIPAGVLFIGGFSSMREDI"
                       "EMNRTRLLEEARIAEKYGQGWLAYIRGALFELSRHGMWNRVLLVLCAFALQNMSGAAAINYYSP"
                       "ILFASLGITDVALYTGIYGLVKAVASIIFYGILIDMWGRRRPTIVSSLACPLCLWFVGAYVKVG"
                       "HPADIIDAGGELSPSTEAGGRAATAMIMIYSVFWSFGLNGIPWIVSAEIFPGALRNLTGTWAAL"
                       "VQWLIQFVITKALPYIFNSLGYGTWFFFASWMLLAIIWSFFFLPETKGKTLDEMHTIFLSKDGT"
                       "HTITLR")
        prot_start, prot_end = 353, 412
        sub_location = cds.get_sub_location_from_protein_coordinates(prot_start, prot_end)
        # pad the beginning so that positions in the sequence line up with the location
        padded = Seq("x" * location.start + nucleotides)
        assert sub_location.extract(padded).translate() == translation[prot_start:prot_end]