def test_indv_blk(self):
    """tests that indv_block can pull out individual fasta sequences
    with their identifying line"""
    tester = constool.indv_block(
        """>OAP01791.1 CDC48A [Arabidopsis thaliana] MSTPAESSDSKSKKDFSTAILERKKSPNRLVVDEAINDDNSVVSLHPATMEKLQLFRGDTILIKGKKRKD TVCIALADETCEEPKIRMNKVVRSNLRVRLGDVISVHQCPDVKYGKRVHILPVDDTVEGVTGNLFDAYLK"""
    )

    # assertIn reports both the needle and the haystack on failure, whereas
    # assertTrue(x in y) only reports "False is not true".
    # The first returned block must carry the identifier line...
    self.assertIn('>OAP01791.1 CDC48A [Arabidopsis thaliana]', tester[0])
    # ...and a stretch of the sequence itself.
    self.assertIn('SKKDFSTAILERKKSPNRLVVDEAINDDNSVVSLHPATMEKLQL', tester[0])
def test_indv_blk_multi(self):
    """Check that indv_block splits a string holding several fasta
    records into one entry per record, identifier line included."""
    fasta_text = (
        ">PROCA12070 | ENSPCAG00000012030 | HOG:0377891.2a.2a | [Procavia capensis]\n"
        "MKTRQNKDSMSMRSGRKKEAPGPREELRSRGRASPGGVSTSSSDGKAEKSRQTAKKARVEEVSAPKVSKQGRGEEISESE\n"
        ">LOXAF14113 | G3TAL7 | HOG:0377891.2a.2a | [Loxodonta africana]\n"
        "MKTRQNKDSMSMRSGRKKEAPGPREELRSRGRASPGGVSTSSSDGKAEKSRQTA\n"
        ">ECHTE02547 | ENSETEG00000016682 | HOG:0377891.2a.2a | [Echinops telfairi]\n"
        "MKTRQNKDSMSMRSGRKKEAPGPREELRS"
    )
    # The middle record is expected back without its trailing newline.
    expected_second = (
        ">LOXAF14113 | G3TAL7 | HOG:0377891.2a.2a | [Loxodonta africana]\n"
        "MKTRQNKDSMSMRSGRKKEAPGPREELRSRGRASPGGVSTSSSDGKAEKSRQTA"
    )

    blocks = constool.indv_block(fasta_text)

    self.assertEqual(len(blocks), 3)
    self.assertEqual(blocks[1], expected_second)
def find_motif(self, msa, motif):
    """
    Searches a multiple sequence alignment for the given motif.

    The entries returned by constool.indv_block are scanned in order and
    the search stops at the first one in which the motif is found.

    Note: The indexing refers to the consensus sequence, or to each
    individual protein, as they are all the same length

    Args:
        msa (str): the path to the multiple sequence alignment
        motif (str): the motif to be located in the msa

    Returns:
        The index at which the motif can be found in the first sequence
        containing it, or -1 if no sequence contains the motif (including
        when the alignment yields no entries)
    """
    # Keep the path and the file contents under separate names rather than
    # rebinding the `msa` parameter.
    with open(msa, "r") as handle:
        alignment = handle.read()

    for entry in constool.indv_block(alignment):
        # NOTE(review): the two constool helpers are expected to reduce the
        # entry to its bare sequence string -- confirm against constool.
        sequence = constool.get_fasta_sequence(constool.seqnwl_strip(entry))
        index = sequence.find(motif)
        if index != -1:
            # First hit wins; later sequences are not inspected.
            return index

    return -1