def test_seqnwl_strip_messy(self):
    """Check that seqnwl_strip removes newlines scattered through a sequence.

    The input is a single FASTA record whose residues are broken over
    several lines of uneven length. The expected result keeps the header
    line (and the newline ending it) and joins the residues into one
    unbroken string with no trailing newline.
    """
    header = (
        ">PROCA12070 | ENSPCAG00000012030 | HOG:0377891.2a.2a | [Procavia capensis]\n"
    )
    messy_record = header + (
        "MKTRQNK\nDSMSMRSGRKKEAPGPREEL\nRSRGRASPGGVSTSSSDGKAEKSRQTAK\nKARVEEVSAPKVSKQGRGEEIS\nESE\n"
    )
    expected_record = header + (
        "MKTRQNKDSMSMRSGRKKEAPGPREELRSRGRASPGGVSTSSSDGKAEKSRQTAKKARVEEVSAPKVSKQGRGEEISESE"
    )

    stripped_record = constool.seqnwl_strip(messy_record)

    self.assertEqual(stripped_record, expected_record)
def get_orthologs(self):
    """
    Build the FASTA text holding the query sequence and its orthologs.

    On the first call the orthologs are retrieved via ``retrieve_OMAid`` and
    ``ortholog_to_fasta``, the first protein of that result is dropped with
    ``constool.remove_first_protein``, and the newline-stripped query
    sequence is placed in front, separated by ``os.linesep``. The
    ``has_run`` flag is then set. On later calls (``has_run`` truthy) the
    ``orthologs`` attribute is returned as-is instead.

    ``self.sequence`` is refreshed from ``self.fasta`` on every call.

    Returns:
        The combined FASTA string, or ``self.orthologs`` when a previous
        call has already completed.

    Raises:
        SequenceError: if ``self.fasta`` is empty (falsy).
    """
    if not self.fasta:
        raise SequenceError("Input sequence is empty!")

    self.sequence = constool.get_fasta_sequence(fasta=self.fasta)

    # A previous call already did the retrieval; reuse what it stored.
    if self.has_run:
        return self.orthologs

    self.retrieve_OMAid()
    ortholog_fasta = constool.remove_first_protein(self.ortholog_to_fasta())
    query_record = constool.seqnwl_strip(self.sequence)
    combined = query_record + os.linesep + ortholog_fasta
    self.has_run = True
    return combined
def find_motif(self, msa, motif):
    """
    Locate a motif in the sequences of a multiple sequence alignment file.

    The file is read in full, split into entries with
    ``constool.indv_block``, and each entry is passed through
    ``constool.seqnwl_strip`` and ``constool.get_fasta_sequence`` before
    being searched with ``str.find``. Entries are searched in order and the
    search stops at the first one that contains the motif.

    Note: the original author states that the index applies equally to the
    consensus sequence and to every protein, since all aligned sequences
    share the same length.

    Args:
        msa (str): the path to the multiple sequence alignment
        motif (str): the motif to be located in the msa

    Returns:
        The index of the motif within the first entry that contains it, or
        -1 if no entry contains it (including when there are no entries).
    """
    with open(msa, "r") as handle:
        alignment_text = handle.read()

    for block in constool.indv_block(alignment_text):
        sequence = constool.get_fasta_sequence(constool.seqnwl_strip(block))
        position = sequence.find(motif)
        if position != -1:
            return position

    return -1