def runMafftAlignmentWithMatrix(inputSequences,
                                gap_open,
                                gap_ext,
                                method="globalpair",
                                matrixfile=""):
    """ Run MAFFT alignment on a group of sequences with a substitution matrix.

    The sequences are exported to ``temp/tempseq.fasta``, MAFFT is started
    through the shell as ``mafft-mac/mafft.bat`` and its standard output is
    redirected to ``temp/tempmsa2.fasta``, which is then read back. All paths
    are relative, so they resolve against the current working directory.

    This function requires a proper path to the mafft binary.

    Args:
        inputSequences (array): an array of strings of sequences to be aligned
        gap_open (float): gap open penalty, passed to MAFFT as ``--op``
        gap_ext (float): gap extent penalty, passed to MAFFT as ``--ep``
        method (str): MAFFT's alignment strategy, passed as ``--<method>``
        matrixfile (str): path to an appropriate matrix file for MAFFT,
            passed as ``--aamatrix``. When empty, the option is left out
            so that it cannot swallow the input file path as its value.

    Returns:
        output (array): aligned strings of sequences, as returned by
            ``io.readFASTA`` for the MAFFT output file

    """
    fasta_in = "temp/tempseq.fasta"
    fasta_out = "temp/tempmsa2.fasta"

    io.exportGroupOfSequencesToFASTA(inputSequences, fasta_in)

    arguments = [
        "mafft-mac/mafft.bat", "--quiet",
        "--op", str(gap_open),
        "--ep", str(gap_ext),
        "--" + method,
        "--maxiterate", "1000",
    ]
    # Only pass --aamatrix when a matrix was actually supplied; a bare
    # "--aamatrix" would take the next token (the input FASTA) as the matrix.
    if matrixfile:
        arguments += ["--aamatrix", matrixfile]
    # The command is run by the shell, which performs the ">" redirection.
    # Arguments are not shell-quoted, so paths must not contain whitespace.
    arguments += [fasta_in, ">", fasta_out]
    os.system(" ".join(arguments))

    return io.readFASTA(fasta_out)
def runMafftAlignmentWithSettings(inputSequences,
                                  gap_open,
                                  gap_ext,
                                  method="globalpair",
                                  allowshift=False):
    """ Run MAFFT alignment on a group of sequences without a substitution matrix.

    The sequences are exported to ``temp/tempseq.fasta``, MAFFT is started
    through the shell as ``mafft-mac/mafft.bat`` with ``--text`` and its
    standard output is redirected to ``temp/tempmsa2.fasta``, which is then
    read back. All paths are relative to the current working directory.

    This function requires a proper path to the mafft binary.

    Args:
        inputSequences (array): an array of strings of sequences to be aligned
        gap_open (float): gap open penalty, passed to MAFFT as ``--op``
        gap_ext (float): gap extent penalty, passed to MAFFT as ``--ep``
        method (str): MAFFT's alignment strategy, passed as ``--<method>``
        allowshift (bool): when true, ``--allowshift`` is added to the
            command; see MAFFT documentation

    Returns:
        output (array): aligned strings of sequences, as returned by
            ``io.readFASTA`` for the MAFFT output file

    """
    fasta_in = "temp/tempseq.fasta"
    fasta_out = "temp/tempmsa2.fasta"
    io.exportGroupOfSequencesToFASTA(inputSequences, fasta_in)

    # Build the command once; the two variants differ only in --allowshift.
    arguments = [
        "mafft-mac/mafft.bat", "--quiet",
        "--op", str(gap_open),
        "--ep", str(gap_ext),
        "--" + method,
        "--maxiterate", "1000",
    ]
    if allowshift:
        arguments.append("--allowshift")
    # The command is run by the shell, which performs the ">" redirection.
    arguments += ["--text", fasta_in, ">", fasta_out]
    os.system(" ".join(arguments))

    return io.readFASTA(fasta_out)
def tcoffeeAlignment(sequences, go, ge):
    """ Align sequences with a plain T-COFFEE run.

    The sequences are exported to ``temp/tcoffee-in.fasta``, ``./t_coffee``
    is run through the shell with FASTA output, and the result is read back
    from ``temp/tcoffee-out.fasta``.

    For more info: http://www.tcoffee.org/Projects/tcoffee/#DOCUMENTATION

    Args:
        sequences (array): array of strings to be aligned
        go (int): gap open penalty; accepted but not forwarded to the
            T-COFFEE command by this function
        ge (int): gap extent penalty; accepted but not forwarded to the
            T-COFFEE command by this function

    Returns:
        output (array): aligned sequences, as returned by ``io.readFASTA``

    """
    fasta_in = "temp/tcoffee-in.fasta"
    fasta_out = "temp/tcoffee-out.fasta"

    # NOTE(review): the MAFFT wrappers in this file call
    # io.exportGroupOfSequencesToFASTA (upper-case "FASTA"); confirm which
    # spelling the io module actually provides.
    io.exportGroupOfSequencesToFasta(sequences, fasta_in)
    os.system("./t_coffee '%s' -quiet -output fasta -outfile '%s'" %
              (fasta_in, fasta_out))
    return io.readFASTA(fasta_out)
def tcoffeeAlignmentWithMatrix(sequences, go, ge, matrixfile):
    """ Align sequences with T-COFFEE using a custom matrix.

    The sequences are exported to ``temp/tcoffee-in.fasta``, ``./t_coffee``
    is run through the shell with the matrix and gap penalties on the
    command line, and the FASTA result is read back from
    ``temp/tcoffee-out.fasta``.

    For more info: http://www.tcoffee.org/Projects/tcoffee/#DOCUMENTATION

    Args:
        sequences (array): array of strings to be aligned
        go (int): gap open penalty, passed after ``-gapopen``
        ge (int): gap extent penalty, passed after ``-gapext``
        matrixfile (str): file path to an appropriate matrix file; it is
            appended directly after an ``X`` prefix in the ``-in`` option

    Returns:
        output (array): aligned sequences, as returned by ``io.readFASTA``

    """
    io.exportGroupOfSequencesToFasta(sequences, "temp/tcoffee-in.fasta")

    # Assemble the shell command from its fixed and variable parts.
    pieces = [
        "./t_coffee 'temp/tcoffee-in.fasta' -in = X",
        matrixfile,
        "  -gapopen = ",
        str(go),
        " -gapext = ",
        str(ge),
        " -quiet -output fasta -outfile 'temp/tcoffee-out.fasta'",
    ]
    os.system("".join(pieces))

    return io.readFASTA("temp/tcoffee-out.fasta")
# Example #5
# 0
# Align the sequences with MAFFT: gap open 2, gap extension 1,
# "globalpair" strategy, shifting not allowed.
MSA = al.runMafftAlignmentWithSettings(
    sequences, 2, 1, method="globalpair", allowshift=False)
# Single-argument call form prints the same text as the print statement.
print("Aligned sequences:")
al.printMSA(MSA)

# Create a Krogh profile HMM from a folk tune family
# First we need the emission probabilities gathered from the whole dataset
directory = "NotAligned/NLBproperSmall"
files = io.filesInPath(directory)
# Concatenate every sequence of every FASTA file into one string and drop
# the gap character so that only real symbols are counted.
allPossibleSymbols = "".join([
    "".join(["".join(seq)
             for seq in io.readFASTA(directory + "/" + fasta_name)])
    for fasta_name in files
]).replace("-", "")
counts = collections.Counter(allPossibleSymbols)  # symbol -> occurrences
alphabet = [symbol for symbol in counts]  # all possible symbols
counts = np.array([counts[symbol]
                   for symbol in alphabet])  # corresponding counts
# Divide by a float total: with integer operands Python 2 floors the
# quotient, which would turn every relative frequency into 0.
EmissionProbabilities = dict(zip(alphabet, counts /
                                 float(np.sum(counts))))  # convert to dictionary

# Settings for the Krogh profile HMM that is built next.
# NOTE(review): the meaning of each value is defined by krogh.profileHMM,
# which is not visible here -- confirm against its signature.
sw, pct = 10, 1
pec = 1.0 / 12
grms = 0.95
model, columnsIsMatchState = krogh.profileHMM(