示例#1
0
def Cij(pos1,pos2):
    """
    Calculate the Hofacker covariation between two RNA positions
    (columns) of an alignment. Each position must be given as a string
    of RNA nucleotides, one character per aligned sequence.

    For every unordered pair of distinct canonical base pairs
    (CG, GC, AU, UA, GU, UG) the term
        f(pair_a) * hammDist(pair_a, pair_b) * f(pair_b)
    is added to the score, where f(pair) is the number of rows whose
    two nucleotides form that pair divided by the column length.

    Modules required:
    - hammDist
    - equLen (called first on both columns; it is expected to reject
      columns of different length)
    Usage: <sequence1> <sequence2>
    Returns: the covariation score (float).
    Raises: ZeroDivisionError when the columns are empty.
    References:
    - Hofacker et al. 2002. Secondary Structure Prediction for
      Aligned RNA Sequences. J. Mol. Biol. 319: 1059-1066.
    """
    equLen(pos1,pos2)
    allow = ['CG','GC','AU','UA','GU','UG']
    # Count every canonical pair in a single pass over the columns instead
    # of rescanning both columns for each of the 15 pair combinations.
    counts = dict.fromkeys(allow, 0)
    for row, base in enumerate(pos1):
        pair = base+pos2[row]
        if pair in counts:
            counts[pair] += 1
    # float() keeps the divisions below true divisions under Python 2.
    len1, len2 = float(len(pos1)), float(len(pos2))
    Cov = 0
    # Visit each unordered pair (a < b) exactly once, in list order.
    for a, first in enumerate(allow):
        for second in allow[a+1:]:
            hamm = hammDist(first, second)
            Cov += (counts[first]/len1)*hamm*(counts[second]/len2)
    return Cov
示例#2
0
def basepairs(seq1, seq2, acid='RNA'):
    """
    Return two sequences that represent all the distinct basepairs present
    in the sequences inputted.

    The two sequences are read column by column. Every column whose two
    characters are both nucleotides of the chosen alphabet ('ACGU' for
    RNA, 'ACGT' for DNA) forms a pair; each distinct pair is kept once,
    in order of first appearance. Columns containing any other character
    (e.g. gaps) are ignored.

    Module required:
    - equLen (called on both sequences; it is expected to reject
      sequences of different length)
    Usage: <sequence 1> <sequence 2> [acid]
    Returns: a tuple (pos1, pos2) of strings, where pos1[i] + pos2[i] is
             the i-th distinct pair.
    Raises: ValueError if acid is neither 'RNA' nor 'DNA'.
    """
    if acid == 'RNA':
        bases = 'ACGU'
    elif acid == 'DNA':
        bases = 'ACGT'
    else:
        # Without this guard an unknown acid only failed later, with an
        # UnboundLocalError on the lookup table.
        raise ValueError("acid must be 'RNA' or 'DNA', got %r" % (acid,))
    # All 16 ordered pairs over the alphabet; a set gives O(1) membership.
    allow = set(first + second for first in bases for second in bases)
    equLen(seq1, seq2)
    total = []
    seen = set()  # mirrors `total` for fast duplicate detection
    for index, base in enumerate(seq1):
        pair = base + seq2[index]
        if pair in allow and pair not in seen:
            seen.add(pair)
            total.append(pair)
    pos1 = ''.join(pair[0] for pair in total)
    pos2 = ''.join(pair[1] for pair in total)
    return pos1, pos2
示例#3
0
def similarity(str1, str2):
    """
    Return the similarity index of two strings: the number of positions
    holding the same character in both, divided by the length of the
    first string. Both strings must have equal length.
    Module required:
    - equLen (from rnatk.stats)
    Usage: <string 1> <string 2>
    Returns: a float between 0.0 and 1.0.
    """
    equLen(str1, str2)
    # Converted to float so the division is a true division on Python 2.
    same = float(sum(1 for pos, char in enumerate(str1) if char == str2[pos]))
    return same/len(str1)
示例#4
0
def hammDist(str1,str2):
    """
    Return the Hamming distance between two strings, i.e. the number of
    positions at which their characters differ.
    Module required:
    - equLen
    Usage: <string 1> <string 2>
    Returns: the distance as an integer.
    """
    equLen(str1,str2)
    return sum(1 for left, right in zip(str1,str2) if left != right)
示例#5
0
def replaceGapPos_by_consensus(sequence, consensus):
    """
    Given a sequence, replace every gap character ('-') with the
    nucleotide found at the same position of the consensus sequence.
    All other positions are left untouched.
    Module required:
    - equLen (rnatk.stats)
    Usage: <sequence> <consensus>
    Returns: the gap-free sequence as a string.
    """
    equLen(sequence, consensus)
    filled = [consensus[pos] if residue == '-' else residue
              for pos, residue in enumerate(sequence)]
    return ''.join(filled)
示例#6
0
def fit_to_secondStruct(secStr, sequence):
    """
    Given an RNA sequence and an RNA secondary structure in dot-bracket notation, this function tries to fit the
    structure to the sequence and returns the resulting secondary structure in dot-bracket notation.

    Every bracket pair of the structure is checked against the sequence: if the two nucleotides at the paired
    positions form one of the pairs CG, GC, AU, UA, GU or UG, the brackets are kept; otherwise both positions
    are replaced by '.'.

    If the number of '(' differs from the number of ')', a message is printed and the program exits through
    sys.exit().

    NOTE(review): the characters 'l' and 'r' are used internally as markers and are turned into '(' and ')'
    at the end, so any 'l'/'r' already present in secStr would come back as brackets.
    NOTE(review): index+search is never bounds-checked; a structure with balanced counts but misplaced
    brackets (e.g. ')(') would index past the end of the list -- confirm callers pass well-formed structures.

    Modules required:
    - sys
    - equLen (from rnatk.stats)
    Usage: <secondary structure> <RNA sequence>
    """
    equLen(secStr, sequence) # check that the length of the structure is equal to that of the sequence
    charLst = list(secStr) # mutable list holding one character of the inputted secondary structure per item
    if charLst.count('(') != charLst.count(')'): # the amount of '(' characters must be equal to the amount of
                                                 # ')' characters
        # NOTE(review): print statement, i.e. Python 2 syntax.
        print 'A bracket is missing. Quitting'
        sys.exit()
    # index: position of the '(' currently used as reference point.
    # search: offset from index of the character currently being inspected.
    index, search = 0, 1
    allow = ['CG','GC','AU','UA','GU','UG']
    # Each resolved pair is rewritten as 'l'/'r' (kept) or '.'/'.' (dropped), so the loop ends once no
    # '(' or no ')' is left in the list.
    while ('(' in charLst) and (')' in charLst):
        repeat = False
        if charLst[index] == '(': # this is the reference point (index) used to search for the next ')' character
            # Unpaired positions and already-resolved pairs are stepped over, one position per pass.
            # The two `if`s below then look at the new index+search within the same pass.
            if charLst[index+search] == '.' or charLst[index+search] == 'r' or charLst[index+search] == 'l':
                search += 1
            if charLst[index+search] == '(': # if the inspected character is an open bracket, it becomes the new
                                             # reference point and the process restarts from it.
                index += search # new reference point
                search = 1 # the search offset restarts at 1
                repeat = True # the new reference point must be ignored by the next `if` in this pass
            if (charLst[index+search] == ')') and repeat==False:
                # Only '.', 'l' and 'r' lie between index and index+search here, so these two brackets
                # are treated as a pair.
                if (sequence[index]+sequence[index+search]) in allow:
                    charLst[index] = 'l'
                    charLst[index+search] = 'r'
                else:
                    charLst[index] = '.'
                    charLst[index+search] = '.'
                # Restart the scan from the beginning of the structure.
                index = 0
                search = 1
        else:
            index += 1
    result = ''.join(charLst) # the result is compiled into a string variable
    return result.replace('l', '(').replace('r', ')')
示例#7
0
def entropy(logbase, *seqs):
    """
    Compute the entropy of one sequence, or the joint entropy of several
    sequences, using logarithms of base `logbase`.

    The sequences are read column by column; the characters found at the
    same position of every sequence are concatenated into one joint
    symbol, and the entropy of the distribution of those symbols is
    returned:
    H(X_1, ..., X_n) = -sum{x_1}...sum{x_n} P(x_1, ..., x_n)log[P(x_1, ..., x_n)]
    Modules required:
    - math
    - equLen (called on all the sequences)
    Usage: <logbase> <seq1> ... <seqN>
    """
    equLen(*seqs)
    # One joint symbol per position of the first sequence.
    columns = [''.join(seq[pos] for seq in seqs) for pos in range(len(seqs[0]))]
    total = float(len(columns))
    # Occurrences of each joint symbol.
    tally = {}
    for column in columns:
        tally[column] = tally.get(column, 0) + 1
    Hjoint = 0
    for symbol in set(columns):
        prob = tally[symbol]/total
        Hjoint -= prob*math.log(prob,logbase)
    return Hjoint