def Cij(pos1, pos2):
    """
    Calculate the Hofacker covariation between two RNA positions in an
    alignment.

    Each position (alignment column) must be given as a string of RNA
    characters; character k of pos1 and character k of pos2 are read
    together as the pair found at row k.

    For every unordered combination (a, b) of two different entries of
    CG, GC, AU, UA, GU, UG the score accumulates

        f(a) * hammDist(a, b) * f(b)

    where f(x) is the number of rows showing pair x divided by the
    column length. Each combination is visited once (b after a in the
    list above), not once per ordering.

    Helpers required (defined elsewhere):
        - equLen   (called first on both columns; presumably rejects
                    columns of different length -- confirm)
        - hammDist

    Usage:
        <sequence1> <sequence2>

    Returns:
        The covariation score as a float. Empty columns lead to a
        division by zero.

    References:
        - Hofacker et al. 2002. Secondary Structure Prediction for
          Aligned RNA Sequences. J. Mol. Biol. 319: 1059-1066.
    """
    equLen(pos1, pos2)
    allow = ['CG', 'GC', 'AU', 'UA', 'GU', 'UG']

    # Tally each allowed pair once up front instead of re-scanning the
    # columns for every (a, b) combination.
    observed = [x + y for x, y in zip(pos1, pos2)]
    counts = [observed.count(pair) for pair in allow]

    rows1, rows2 = float(len(pos1)), float(len(pos2))
    Cov = 0
    for a in range(len(allow)):
        for b in range(a + 1, len(allow)):
            hamm = hammDist(allow[a], allow[b])
            Cov += (counts[a] / rows1) * hamm * (counts[b] / rows2)
    return Cov
def basepairs(seq1, seq2, acid='RNA'):
    """
    Return two strings listing every distinct pair of characters found
    at matching indices of the two input sequences.

    Index k of seq1 is combined with index k of seq2. A combination is
    kept only if both characters belong to the alphabet selected by
    `acid` ('RNA' -> A, C, G, U; 'DNA' -> A, C, G, T) and it has not been
    seen before. Kept pairs stay in order of first appearance.

    Helper required (defined elsewhere):
        - equLen   (called on both sequences before pairing; presumably
                    rejects sequences of different length -- confirm)

    Usage:
        <sequence 1> <sequence 2> [acid]

    Returns:
        (pos1, pos2): pos1 holds the first character of every kept pair
        and pos2 the second character, so pos1[k] + pos2[k] is the k-th
        distinct pair.

    Raises:
        ValueError: if `acid` is neither 'RNA' nor 'DNA'.
    """
    if acid == 'RNA':
        letters = 'ACGU'
    elif acid == 'DNA':
        letters = 'ACGT'
    else:
        # Without this guard the pair table is never defined and the
        # function fails later with an unrelated-looking name error.
        raise ValueError("acid must be 'RNA' or 'DNA', got %r" % (acid,))

    # All 16 two-letter combinations over the chosen alphabet.
    allow = set(x + y for x in letters for y in letters)

    equLen(seq1, seq2)

    total = []      # kept pairs, in order of first appearance
    seen = set()    # same content as `total`, for constant-time lookup
    for first, second in zip(seq1, seq2):
        pair = first + second
        if pair in allow and pair not in seen:
            seen.add(pair)
            total.append(pair)

    pos1 = ''.join(pair[0] for pair in total)
    pos2 = ''.join(pair[1] for pair in total)
    return pos1, pos2
def similarity(str1, str2):
    """
    Return the similarity index between two strings: the fraction of
    indices at which both strings hold the same character.

    Both strings must have equal length.

    Helper required:
        - equLen (from rnatk.stats), called on both strings first

    Usage:
        <string 1> <string 2>

    Returns:
        A float: number of matching indices divided by len(str1).
        Empty strings lead to a division by zero.
    """
    equLen(str1, str2)
    identical = sum(
        1 for pos, symbol in enumerate(str1) if symbol == str2[pos]
    )
    return float(identical) / len(str1)
def hammDist(str1, str2):
    """
    Return the Hamming distance between two strings, i.e. how many
    indices hold different characters in str1 and str2.

    Helper required (defined elsewhere):
        - equLen, called on both strings before counting

    Usage:
        <string 1> <string 2>

    Returns:
        The number of differing indices as an int.
    """
    equLen(str1, str2)
    return sum(1 for left, right in zip(str1, str2) if left != right)
def replaceGapPos_by_consensus(sequence, consensus):
    """
    Fill the gaps of a sequence with the matching characters of a
    consensus sequence.

    Every '-' in `sequence` is replaced by the character found at the
    same index in `consensus`; all other characters are kept as they
    are.

    Helper required:
        - equLen (rnatk.stats), called on both inputs first

    Usage:
        <sequence> <consensus>

    Returns:
        The gap-free sequence as a string.
    """
    equLen(sequence, consensus)
    filled = (
        consensus[pos] if symbol == '-' else symbol
        for pos, symbol in enumerate(sequence)
    )
    return ''.join(filled)
def fit_to_secondStruct(secStr, sequence):
    """
    Fit a secondary structure in dot-bracket notation to an RNA sequence.

    Every bracket pair of `secStr` is checked against the two bases found
    at the same indices in `sequence`. If the bases form one of
    CG, GC, AU, UA, GU or UG the pair is kept; otherwise both brackets
    are replaced by '.'. Characters other than '(' and ')' are copied to
    the result unchanged.

    Helpers / modules required:
        - equLen (from rnatk.stats), called first to compare the length
          of the structure with the length of the sequence
        - sys

    Usage:
        <secondary structure> <RNA sequence>

    Returns:
        The fitted secondary structure as a dot-bracket string with the
        same length as `secStr`.

    If the numbers of '(' and ')' differ, or a ')' appears before any
    '(' that could open it, a message is printed and the interpreter
    exits through sys.exit().
    """
    equLen(secStr, sequence)
    charLst = list(secStr)

    if charLst.count('(') != charLst.count(')'):
        # Parenthesised form prints the same text on Python 2 and 3.
        print('A bracket is missing. Quitting')
        sys.exit()

    allow = set(['CG', 'GC', 'AU', 'UA', 'GU', 'UG'])

    # Single left-to-right pass: indices of '(' still waiting for their
    # partner are stacked, and each ')' closes the most recent one.
    pending = []
    for pos, symbol in enumerate(charLst):
        if symbol == '(':
            pending.append(pos)
        elif symbol == ')':
            if not pending:
                # Counts match overall, yet this ')' has nothing to
                # close (e.g. ")("): report it instead of failing with
                # an index error.
                print('A bracket is missing. Quitting')
                sys.exit()
            partner = pending.pop()
            if sequence[partner] + sequence[pos] not in allow:
                charLst[partner] = '.'
                charLst[pos] = '.'

    return ''.join(charLst)
def entropy(logbase, *seqs):
    """
    Compute the entropy of one sequence, or the joint entropy of
    several sequences.

    The characters found at the same index across all sequences are
    concatenated into one joint symbol; the entropy is taken over the
    observed frequencies of those joint symbols:

        H(X_1, ..., X_n) =
            -sum{x_1}...sum{x_n} P(x_1, ..., x_n) log[P(x_1, ..., x_n)]

    Helpers / modules required:
        - math
        - equLen (defined elsewhere), called on all sequences first;
          presumably rejects sequences of different length -- confirm

    Usage:
        <logbase> <seq1> ... <seqN>

    Returns:
        The (joint) entropy in units set by `logbase`. When there is no
        column to evaluate the initial value 0 is returned.
    """
    equLen(*seqs)

    # One joint symbol per index, e.g. ('A', 'U') -> 'AU'.
    joint_list = [''.join(column) for column in zip(*seqs)]

    # Tally every joint symbol in a single pass instead of calling
    # list.count() once per distinct symbol.
    occurrences = {}
    for symbol in joint_list:
        occurrences[symbol] = occurrences.get(symbol, 0) + 1

    total = float(len(joint_list))
    Hjoint = 0
    for count in occurrences.values():
        prob = count / total
        Hjoint -= prob * math.log(prob, logbase)
    return Hjoint