def runMafftAlignmentWithMatrix(inputSequences, gap_open, gap_ext, method="globalpair", matrixfile=""):
    """
    Run MAFFT alignment on a group of sequences with a substitution matrix.
    This function requires a proper path to the mafft binary.

    The sequences are exported to ``temp/tempseq.fasta``, MAFFT's standard
    output is captured in ``temp/tempmsa2.fasta`` and that file is read back.

    Args:
        inputSequences (array): an array of strings of sequences to be aligned
        gap_open (float): gap open penalty
        gap_ext (float): gap extent penalty
        method (str): MAFFT's alignment strategy
        matrixfile (str): path to an appropriate matrix file for MAFFT; when
            empty, the ``--aamatrix`` option is omitted

    Returns:
        output (array): aligned strings of sequences

    Raises:
        RuntimeError: if the MAFFT process exits with a non-zero status
    """
    # Local import so the block does not depend on a module-level import.
    import subprocess

    fasta_in = "temp/tempseq.fasta"
    fasta_out = "temp/tempmsa2.fasta"
    io.exportGroupOfSequencesToFASTA(inputSequences, fasta_in)

    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact.
    args = [
        "mafft-mac/mafft.bat",
        "--quiet",
        "--op", str(gap_open),
        "--ep", str(gap_ext),
        "--" + method,
        "--maxiterate", "1000",
    ]
    # Without this guard an empty matrixfile would make MAFFT consume the
    # input FASTA path as the value of --aamatrix.
    if matrixfile:
        args.extend(["--aamatrix", matrixfile])
    args.append(fasta_in)

    # MAFFT writes the alignment to stdout; capture it in the output file.
    with open(fasta_out, "w") as handle:
        status = subprocess.call(args, stdout=handle)
    if status != 0:
        raise RuntimeError("MAFFT exited with status %d: %s" % (status, " ".join(args)))

    return io.readFASTA(fasta_out)
def runMafftAlignmentWithSettings(inputSequences, gap_open, gap_ext, method="globalpair", allowshift=False):
    """
    Run MAFFT alignment on a group of sequences without a substitution matrix.
    This function requires a proper path to the mafft binary.

    The sequences are exported to ``temp/tempseq.fasta``, MAFFT's standard
    output is captured in ``temp/tempmsa2.fasta`` and that file is read back.
    MAFFT is always invoked with ``--text``.

    Args:
        inputSequences (array): an array of strings of sequences to be aligned
        gap_open (float): gap open penalty
        gap_ext (float): gap extent penalty
        method (str): MAFFT's alignment strategy
        allowshift (bool): when true, ``--allowshift`` is added to the
            command (see MAFFT documentation)

    Returns:
        output (array): aligned strings of sequences

    Raises:
        RuntimeError: if the MAFFT process exits with a non-zero status
    """
    # Local import so the block does not depend on a module-level import.
    import subprocess

    fasta_in = "temp/tempseq.fasta"
    fasta_out = "temp/tempmsa2.fasta"
    io.exportGroupOfSequencesToFASTA(inputSequences, fasta_in)

    # Build the command once; the only difference between the two variants
    # is the optional --allowshift flag.
    args = [
        "mafft-mac/mafft.bat",
        "--quiet",
        "--op", str(gap_open),
        "--ep", str(gap_ext),
        "--" + method,
        "--maxiterate", "1000",
    ]
    if allowshift:
        args.append("--allowshift")
    args.extend(["--text", fasta_in])

    # MAFFT writes the alignment to stdout; capture it in the output file.
    with open(fasta_out, "w") as handle:
        status = subprocess.call(args, stdout=handle)
    if status != 0:
        raise RuntimeError("MAFFT exited with status %d: %s" % (status, " ".join(args)))

    return io.readFASTA(fasta_out)
def tcoffeeAlignment(sequences, go, ge):
    """
    The standard T-COFFEE alignment.
    For more info: http://www.tcoffee.org/Projects/tcoffee/#DOCUMENTATION

    The sequences are exported to ``temp/tcoffee-in.fasta`` and the alignment
    is read back from ``temp/tcoffee-out.fasta``.

    Args:
        sequences (array): array of strings to be aligned
        go (int): gap open penalty (accepted for interface compatibility;
            not forwarded to T-COFFEE by this function)
        ge (int): gap extent penalty (accepted for interface compatibility;
            not forwarded to T-COFFEE by this function)

    Returns:
        output (array): aligned sequences

    Raises:
        RuntimeError: if the T-COFFEE process exits with a non-zero status
    """
    # Local import so the block does not depend on a module-level import.
    import subprocess

    fasta_in = "temp/tcoffee-in.fasta"
    fasta_out = "temp/tcoffee-out.fasta"

    # NOTE(review): the MAFFT wrappers in this file call
    # io.exportGroupOfSequencesToFASTA (upper-case FASTA, like io.readFASTA);
    # the same spelling is used here for consistency -- confirm against io.
    io.exportGroupOfSequencesToFASTA(sequences, fasta_in)

    # An argument list (no shell) removes the need for manual quoting.
    args = [
        "./t_coffee", fasta_in,
        "-quiet",
        "-output", "fasta",
        "-outfile", fasta_out,
    ]
    status = subprocess.call(args)
    if status != 0:
        raise RuntimeError("T-COFFEE exited with status %d: %s" % (status, " ".join(args)))

    return io.readFASTA(fasta_out)
def tcoffeeAlignmentWithMatrix(sequences, go, ge, matrixfile):
    """
    The standard T-COFFEE alignment with custom matrix
    For more info: http://www.tcoffee.org/Projects/tcoffee/#DOCUMENTATION

    The sequences are exported to ``temp/tcoffee-in.fasta`` and the alignment
    is read back from ``temp/tcoffee-out.fasta``.

    Args:
        sequences (array): array of strings to be aligned
        go (int): gap open penalty
        ge (int): gap extent penalty
        matrixfile (str): file path to an appropriate matrix file; passed to
            T-COFFEE prefixed with ``X``

    Returns:
        output (array): aligned sequences

    Raises:
        RuntimeError: if the T-COFFEE process exits with a non-zero status
    """
    # Local import so the block does not depend on a module-level import.
    import subprocess

    fasta_in = "temp/tcoffee-in.fasta"
    fasta_out = "temp/tcoffee-out.fasta"

    # NOTE(review): the MAFFT wrappers in this file call
    # io.exportGroupOfSequencesToFASTA (upper-case FASTA, like io.readFASTA);
    # the same spelling is used here for consistency -- confirm against io.
    io.exportGroupOfSequencesToFASTA(sequences, fasta_in)

    # The option/value tokens (including the stand-alone "=") are kept
    # exactly as the previous shell command line split them, so T-COFFEE
    # receives the same argument vector as before.
    args = [
        "./t_coffee", fasta_in,
        "-in", "=", "X" + matrixfile,
        "-gapopen", "=", str(go),
        "-gapext", "=", str(ge),
        "-quiet",
        "-output", "fasta",
        "-outfile", fasta_out,
    ]
    status = subprocess.call(args)
    if status != 0:
        raise RuntimeError("T-COFFEE exited with status %d: %s" % (status, " ".join(args)))

    return io.readFASTA(fasta_out)
# align them using MAFFT MSA = al.runMafftAlignmentWithSettings(sequences, 2, 1, method="globalpair", allowshift=False) print "Aligned sequences:" al.printMSA(MSA) # Create a Krogh profile HMM from a folk tune family # First we need the emission probabilities gathered from the whole dataset directory = "NotAligned/NLBproperSmall" files = io.filesInPath(directory) allPossibleSymbols = "".join([ "".join(["".join(seq) for seq in io.readFASTA(directory + "/" + file)]) for file in files ]).replace("-", "") counts = collections.Counter(allPossibleSymbols) # count alphabet = [symbol for symbol in counts] # all possible symbols counts = np.array([counts[symbol] for symbol in counts]) # corresponding counts EmissionProbabilities = dict(zip(alphabet, counts / np.sum(counts))) # convert to dictionary # Bake the Krogh profile HMM model sw = 10 pct = 1 pec = 1 / 12. grms = 0.95 model, columnsIsMatchState = krogh.profileHMM(