예제 #1
0
def test():
    stree = GeneralisedSuffixTree(['mississippi'])
    for shared in stree.sharedSubstrings(2):
        for seq, start, stop in shared:
            print seq, '[' + str(start) + ':' + str(stop) + ']',
            print stree.sequences[seq][start:stop],
            print stree.sequences[seq][:start] + '|' + stree.sequences[seq][
                start:stop] + '|' + stree.sequences[seq][stop:]
예제 #2
0
def getMAXchSTR(string, minLength=15):
    """Return the longest substring the suffix tree reports for *string*.

    A GeneralisedSuffixTree is built over the single input string and every
    (sequence, start, stop) triple yielded by ``sharedSubstrings(minLength)``
    is inspected. The longest slice wins; on ties the first one seen is kept.

    Args:
        string: the text to analyse.
        minLength: value handed to ``sharedSubstrings``. Defaults to 15, the
            value that used to be hard-coded here; the original comment
            described it as "five chinese characters" (presumably 3 bytes
            each in a UTF-8 byte string -- TODO confirm).

    Returns:
        A ``(maxlength, register)`` tuple: the length of the longest slice
        found and the slice itself, or ``(0, "")`` when nothing is reported.
    """
    stree = GeneralisedSuffixTree([string])
    # Track the best length seen so far together with the matching substring.
    maxlength = 0
    register = ""
    for shared in stree.sharedSubstrings(minLength):
        for seq, start, stop in shared:
            length = stop - start
            if length > maxlength:
                maxlength = length
                register = stree.sequences[seq][start:stop]
    return maxlength, register
예제 #3
0
def main():
    """Print the longest substring shared by the first two lines of a file.

    The file path is read from ``sys.argv[1]``. Both lines are stripped of
    surrounding whitespace before the suffix tree is built. When several
    shared substrings have the same maximal length the first one reported
    is printed; an empty line is printed when none is reported.
    """
    with open(sys.argv[1], 'r') as handle:
        first = handle.readline().strip()
        second = handle.readline().strip()

    seqs = [first, second]
    tree = GeneralisedSuffixTree(seqs)

    # ``best`` starts empty, so only strictly longer candidates replace it.
    best = ''
    for group in tree.sharedSubstrings():
        for index, lo, hi in group:
            candidate = seqs[index][lo:hi]
            if len(candidate) > len(best):
                best = candidate

    print(best)
예제 #4
0
    def __computeLCS(self, stringList):
        '''
        Returns a one-element list containing the LCS of the input stringList.

        The strings are translated with __translateCharacters, fed to a
        GeneralisedSuffixTree, and the longest slice reported by
        sharedSubstrings() (the first one on ties) is translated back.
        When __isComputable rejects the input, the list is split in half,
        each half is solved recursively, and the two partial results are
        combined by one more recursive call.

        Returns [""] when the suffix tree reports no shared substring.
        '''

        alphabet = self.__getAlphabet(
            stringList)  # get alphabet of (all characters in) stringList

        # check if alphabet requires too many characters to create enough
        # terminal characters for each string in stringList
        if not self.__isComputable(stringList, alphabet):
            # Floor division keeps the slice index an int even when true
            # division is in effect (Python 3 / __future__.division).
            # NOTE(review): for a one-element list the second half equals the
            # input itself, so this recursion would not terminate -- confirm
            # that __isComputable never rejects such a list.
            middle = len(stringList) // 2
            return self.__computeLCS(
                self.__computeLCS(stringList[:middle]) +
                self.__computeLCS(stringList[middle:]))

        (stringList, translationDict) = self.__translateCharacters(
            stringList, alphabet)  # translate characters in stringList

        # make suffix tree
        stree = GeneralisedSuffixTree(stringList)
        # get all shared substrings
        sharedSubstrings = [
            stree.sequences[seq][start:stop]
            for shared in stree.sharedSubstrings()
            for seq, start, stop in shared
        ]
        if not sharedSubstrings:
            return [""]

        # max() returns the first longest entry, matching index(max(lengths)).
        lcs = max(sharedSubstrings, key=len)

        # Back translate in a single pass. Chained str.replace calls could
        # re-translate a character that an earlier replacement had just
        # produced whenever the translated and original alphabets overlap.
        # Assumes translationDict keys are single characters, as the
        # (translatedChar, originalChar) pairs it holds suggest.
        return ["".join(translationDict.get(char, char) for char in lcs)]
예제 #5
0
import sys

from suffix_tree import GeneralisedSuffixTree

# Reads the first two lines of the file named by sys.argv[1], looks for the
# longest substring the suffix tree reports as shared between them, and
# writes it to output.txt.
#
# output.txt is opened first (as before), so it is created/truncated even if
# reading the input fails. Both files are now closed deterministically.
with open("output.txt", 'w') as out:
    with open(sys.argv[1], 'r') as source:
        Input = source.read().split("\n")

    text1 = Input[0].strip()
    text2 = Input[1].strip()

    stree = GeneralisedSuffixTree([text1, text2])

    max_len = 0
    # Start from an empty result so that an input without any shared
    # substring yields an empty line instead of a NameError.
    longestSS = ""
    for shared in stree.sharedSubstrings():
        # Only the first occurrence of each shared substring is examined.
        seq, start, stop = shared[0]
        if (stop - start) > max_len:
            max_len = stop - start
            longestSS = stree.sequences[seq][start:stop]
    print >> out, longestSS
예제 #6
0
# Complement of each of the four bases: A<->T and C<->G.
baseComplement = dict(zip('ACGT', 'TGCA'))


def revc(seq):
    """Return the reverse complement of *seq*.

    The sequence is walked from its last base to its first and every base
    is replaced through ``baseComplement``. A base that is not one of
    'A', 'C', 'G', 'T' raises KeyError.
    """
    complemented = (baseComplement[base] for base in reversed(seq))
    return "".join(complemented)


# Build a random string, which should have some short reverse complements already.
bases = ['A', 'C', 'G', 'T']
data = ''.join(choice(bases) for i in xrange(400000))
#data = "AGGGTTTCCCTGACCTTCACTGCAGGTCATGCA"
# revc  TGCATGACCTGCAGTGAAGGTCAGGGAAACCCT
#       012345678901234567890123456789012
#                 1         2         3

print "Got data"
revdata = revc(data)
print "Got reverse data"

n = len(data)
minlength = 18
tree = GeneralisedSuffixTree([data, revdata])
for shared in tree.sharedSubstrings(minlength):
    _, start, stop = shared[0]
    seq = data[start:stop]
    _, rstart, rstop = shared[1]
    rseq = data[n - rstop:n - rstart]
    print "Match: {0} at [{1}:{2}] and {3} at [{4}:{5}]".format(
        seq, start, stop, rseq, n - rstop, n - rstart)
예제 #7
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from suffix_tree import GeneralisedSuffixTree

# s1 = u'mississippi'
# s2 = u'sippissi'

s1 = u'一寸光阴一寸金';
s2 = u'寸金难买寸光阴';
stree = GeneralisedSuffixTree([s1,s2]) 

for shared in stree.sharedSubstrings(2):
    print '-'*70
    for (seq,start,stop) in shared:
       print seq, 
       print '['+str(start)+':'+str(stop)+']',
       ss = stree.sequences[seq][start:stop]
       print ss.encode('utf-8'),
       at = stree.sequences[seq][:start]+\
                    '{'+stree.sequences[seq][start:stop]+'}'+\
                    stree.sequences[seq][stop:]
       print at.encode('utf-8')
print '='*70 
예제 #8
0

def validateSubstring(strings, seq):
    """Return True when *seq* occurs in every element of *strings*.

    Checking stops at the first element that does not contain *seq*.
    An empty *strings* collection yields True.
    """
    return all(candidate.find(seq) != -1 for candidate in strings)


with open('rosalind_lcs.txt') as spec:
    data = [seq.strip() for seq in spec]

    # The generalized suffix tree doesn't work well with a large number of strings.
    # Use the first 10 to generate candidates, and then compare each candidate
    # (in decreasing length order) to the data to find a common substring.
    tree = GeneralisedSuffixTree(data[:10])
    candidates = []
    for shared in tree.sharedSubstrings(5):
        for seq, start, stop in shared:
            candidates.append(tree.sequences[seq][start:stop])
            break

    candidates.sort(cmp=None, key=lambda s: len(s), reverse=True)
    for c in candidates:
        if validateSubstring(data, c):
            print c
            print len(c)
            break
    else:
        print "No common string found!"