def test(): stree = GeneralisedSuffixTree(['mississippi']) for shared in stree.sharedSubstrings(2): for seq, start, stop in shared: print seq, '[' + str(start) + ':' + str(stop) + ']', print stree.sequences[seq][start:stop], print stree.sequences[seq][:start] + '|' + stree.sequences[seq][ start:stop] + '|' + stree.sequences[seq][stop:]
def getMAXchSTR(string):
    """Find the longest substring reported by sharedSubstrings(15) for `string`.

    A GeneralisedSuffixTree is built over the one-element list ``[string]``.
    Every (seq, start, stop) triple yielded by ``sharedSubstrings(15)`` is
    examined and the first one with the greatest ``stop - start`` is kept.

    Returns:
        A ``(length, substring)`` tuple; ``(0, "")`` when nothing is yielded.
    """
    tree = GeneralisedSuffixTree([string])

    best_len = 0
    best_text = ""
    # 15 is the threshold noted in the original as "five chinese characters"
    # (presumably three UTF-8 bytes per character -- confirm with the caller).
    for group in tree.sharedSubstrings(15):
        for index, begin, end in group:
            span = end - begin
            if span <= best_len:
                # Not strictly longer than the current record: keep the record.
                continue
            best_len = span
            best_text = tree.sequences[index][begin:end]

    return best_len, best_text
def main():
    """Print the longest substring shared by the first two lines of a file.

    The file path is taken from ``sys.argv[1]``.  The first two lines are read
    and stripped, a GeneralisedSuffixTree is built over them, and every
    (seq, start, stop) triple yielded by ``sharedSubstrings()`` is sliced out
    of the corresponding input line.  The first slice of maximal length is
    printed; an empty string is printed when nothing is yielded.
    """
    with open(sys.argv[1], 'r') as handle:
        first_line = handle.readline().strip()
        second_line = handle.readline().strip()

    texts = [first_line, second_line]
    tree = GeneralisedSuffixTree(texts)

    longest = ''
    for group in tree.sharedSubstrings():
        for which, begin, end in group:
            candidate = texts[which][begin:end]
            # Strict comparison: on ties the earlier candidate is kept.
            if len(candidate) > len(longest):
                longest = candidate

    print(longest)
def __computeLCS(self, stringList):
    '''
    Returns a one-element list containing the LCS of the input stringList.

    Steps:
      1. Collect the alphabet of stringList via __getAlphabet.
      2. If __isComputable rejects the input (per the original note: the
         alphabet requires too many characters to leave enough terminal
         characters for each string), split stringList in half, compute the
         LCS of each half recursively, and compute the LCS of the two results.
      3. Otherwise translate the characters via __translateCharacters, build
         a GeneralisedSuffixTree, collect every shared substring, pick the
         first longest one and translate it back.

    Returns [""] when the tree reports no shared substring.
    '''
    # Alphabet of (all characters in) stringList.
    alphabet = self.__getAlphabet(stringList)

    # Divide and conquer when the whole list cannot be handled in one tree.
    if not self.__isComputable(stringList, alphabet):
        # Floor division: a slice index must be an int ("/" yields a float
        # on Python 3 and would raise TypeError).
        half = len(stringList) // 2
        return self.__computeLCS(
            self.__computeLCS(stringList[:half]) +
            self.__computeLCS(stringList[half:]))

    # Translate characters in stringList; translationDict maps each
    # translated character back to its original character.
    (stringList, translationDict) = self.__translateCharacters(
        stringList, alphabet)

    # Make suffix tree.
    stree = GeneralisedSuffixTree(stringList)

    # Get all shared substrings (one entry per reported occurrence).
    sharedSubstrings = [
        stree.sequences[seq][start:stop]
        for shared in stree.sharedSubstrings()
        for seq, start, stop in shared
    ]
    if not sharedSubstrings:
        return [""]

    # max() returns the first of several equally long candidates, the same
    # element the index-of-max-length lookup selected.
    lcs = max(sharedSubstrings, key=len)

    # Back translate in a single pass.  Chained str.replace() calls would
    # re-translate characters that were already restored whenever a
    # translated character coincides with another entry's original
    # character, and the outcome would depend on dict iteration order.
    # Assumes translationDict keys are single characters -- confirm against
    # __translateCharacters.
    lcs = "".join(translationDict.get(char, char) for char in lcs)
    return [lcs]
import sys

from suffix_tree import GeneralisedSuffixTree

# Read the two input texts from the first two lines of the file named on the
# command line.  The handle is closed as soon as the content has been read.
with open(sys.argv[1], 'r') as source:
    Input = source.read().split("\n")
text1 = Input[0].strip()
text2 = Input[1].strip()

stree = GeneralisedSuffixTree([text1, text2])

max_len = 0
# Start from an empty result so the final write cannot hit an unbound name
# when the tree reports no shared substring longer than zero characters.
longestSS = ''
for shared in stree.sharedSubstrings():
    # Only the first occurrence of each shared substring is inspected.
    seq, start, stop = shared[0]
    if (stop - start) > max_len:
        max_len = stop - start
        longestSS = stree.sequences[seq][start:stop]

# Write the result followed by a newline; the file is opened only after the
# computation succeeded and is closed (and flushed) deterministically.
with open("output.txt", 'w') as out:
    out.write(longestSS + "\n")
baseComplement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'} def revc(seq): return "".join([baseComplement[base] for base in seq[::-1]]) # Build a random string, which should have some short reverse complements already. bases = ['A', 'C', 'G', 'T'] data = ''.join(choice(bases) for i in xrange(400000)) #data = "AGGGTTTCCCTGACCTTCACTGCAGGTCATGCA" # revc TGCATGACCTGCAGTGAAGGTCAGGGAAACCCT # 012345678901234567890123456789012 # 1 2 3 print "Got data" revdata = revc(data) print "Got reverse data" n = len(data) minlength = 18 tree = GeneralisedSuffixTree([data, revdata]) for shared in tree.sharedSubstrings(minlength): _, start, stop = shared[0] seq = data[start:stop] _, rstart, rstop = shared[1] rseq = data[n - rstop:n - rstart] print "Match: {0} at [{1}:{2}] and {3} at [{4}:{5}]".format( seq, start, stop, rseq, n - rstop, n - rstart)
#!/usr/bin/env python # -*- coding: utf-8 -*- from suffix_tree import GeneralisedSuffixTree # s1 = u'mississippi' # s2 = u'sippissi' s1 = u'一寸光阴一寸金'; s2 = u'寸金难买寸光阴'; stree = GeneralisedSuffixTree([s1,s2]) for shared in stree.sharedSubstrings(2): print '-'*70 for (seq,start,stop) in shared: print seq, print '['+str(start)+':'+str(stop)+']', ss = stree.sequences[seq][start:stop] print ss.encode('utf-8'), at = stree.sequences[seq][:start]+\ '{'+stree.sequences[seq][start:stop]+'}'+\ stree.sequences[seq][stop:] print at.encode('utf-8') print '='*70
def validateSubstring(strings, seq): for s in strings: if s.find(seq) == -1: return False return True with open('rosalind_lcs.txt') as spec: data = [seq.strip() for seq in spec] # The generalized suffix tree doesn't work well with a large number of strings. # Use the first 10 to generate candidates, and then compare each candidate # (in decreasing length order) to the data to find a common substring. tree = GeneralisedSuffixTree(data[:10]) candidates = [] for shared in tree.sharedSubstrings(5): for seq, start, stop in shared: candidates.append(tree.sequences[seq][start:stop]) break candidates.sort(cmp=None, key=lambda s: len(s), reverse=True) for c in candidates: if validateSubstring(data, c): print c print len(c) break else: print "No common string found!"