def test(): stree = GeneralisedSuffixTree(['mississippi']) for shared in stree.sharedSubstrings(2): for seq,start,stop in shared: print seq, '['+str(start)+':'+str(stop)+']', print stree.sequences[seq][start:stop], print stree.sequences[seq][:start]+'|'+stree.sequences[seq][start:stop]+'|'+stree.sequences[seq][stop:]
def test(): stree = GeneralisedSuffixTree(['mississippi']) for shared in stree.sharedSubstrings(2): for seq, start, stop in shared: print seq, '[' + str(start) + ':' + str(stop) + ']', print stree.sequences[seq][start:stop], print stree.sequences[seq][:start] + '|' + stree.sequences[seq][ start:stop] + '|' + stree.sequences[seq][stop:]
def lcsm(strings):
    """Return the longest substring shared by `strings`, or None.

    A generalised suffix tree is built over `strings`; each item yielded by
    ``sharedSubstrings()`` is a list of (sequence, start, stop) triples, and
    the item whose first triple spans the most characters wins.

    Returns None when the tree reports no shared substring at all, or when a
    TypeError is raised while selecting the winner (the original best-effort
    behaviour).
    """
    stree = GeneralisedSuffixTree(strings)
    try:
        groups = list(stree.sharedSubstrings())
        if not groups:
            # max() raises ValueError on an empty iterable, which the
            # TypeError guard would not catch; report "nothing found" the
            # same way as the other failure path.
            return None
        max_tuples = max(groups, key=lambda ss: ss[0][2] - ss[0][1])
        num, start, stop = max_tuples[0]
        return stree.sequences[num][start:stop]
    except TypeError:
        return None
def getMAXchSTR(string, min_length=15):
    """Find the longest repeated substring of `string`.

    Args:
        string: the text to analyse (a single-sequence suffix tree is built).
        min_length: value forwarded to ``sharedSubstrings``; the default 15
            was chosen by the original author as "five Chinese characters"
            (presumably three UTF-8 bytes each -- confirm).

    Returns:
        A ``(maxlength, register)`` tuple: the length of the longest reported
        substring and the substring itself, or ``(0, "")`` when nothing is
        reported.
    """
    stree = GeneralisedSuffixTree([string])
    # Track the longest span seen so far and the text it covers.
    maxlength, register = 0, ""
    for shared in stree.sharedSubstrings(min_length):
        for seq, start, stop in shared:
            span = stop - start
            if span <= maxlength:
                continue
            maxlength = span
            register = stree.sequences[seq][start:stop]
    return maxlength, register
def getMAXchSTR(string, min_length=15):
    """Return the length and text of the longest repeated substring.

    Args:
        string: input text; it is the only sequence placed in the tree.
        min_length: threshold passed to ``sharedSubstrings``. Defaults to 15,
            which the original comment describes as five Chinese characters
            (presumably UTF-8 encoded -- confirm).

    Returns:
        ``(maxlength, register)`` where ``register`` is the longest substring
        reported and ``maxlength`` its length; ``(0, "")`` if none is found.
    """
    stree = GeneralisedSuffixTree([string])
    maxlength = 0
    register = ""
    for shared in stree.sharedSubstrings(min_length):
        for seq, start, stop in shared:
            length = stop - start
            # Strict comparison keeps the first substring of a given length.
            if length > maxlength:
                maxlength = length
                register = stree.sequences[seq][start:stop]
    return maxlength, register
def genrule(str_seq): stree = GeneralisedSuffixTree(str_seq) for shared in stree.sharedSubstrings(50): print '-'*70 # print shared for seq,start,stop in shared: if stop-start > 20: print seq, '['+str(start)+':'+str(stop)+']', print str_seq[seq][start:stop], print str_seq[:start]+'|'+ str_seq[seq][start:stop]+'|'+ str_seq[seq][stop:] print '='*70
def LCS(s1, s2):
    """Return the longest common substring of two UTF-8 byte strings.

    Args:
        s1, s2: byte strings holding UTF-8 encoded text.

    Returns:
        The longest shared substring, encoded back to UTF-8 so the return
        type matches the inputs.

    Raises:
        ValueError: from ``max`` when the tree reports no shared substring.
    """
    from suffix_tree import GeneralisedSuffixTree

    decoded = [unicode(s1, 'utf-8'), unicode(s2, 'utf-8')]
    stree = GeneralisedSuffixTree(decoded)
    longs = set()
    for shared in stree.sharedSubstrings():
        for seq, start, stop in shared:
            # The tree was built from the decoded text, so its offsets index
            # characters; slicing the raw byte strings with them would cut
            # multi-byte characters apart and return the wrong span.
            longs.add(decoded[seq][start:stop])
    return max(longs, key=len).encode('utf-8')
def parseBench(f1, f2, f3, f4, f5):
    """Count substrings shared by f1 and f2, and how many recur in f3-f5.

    Each argument is handed to ``parse``; the second element of its 4-tuple
    result is treated as a sequence of byte values and hex-encoded. A
    generalised suffix tree over the first two hex strings supplies shared
    substrings (``sharedSubstrings(20)``). For every occurrence located in
    the first string, its length in bytes (hex length / 2) is tallied.

    Returns:
        ``(matches, matches3, matches4, matches5)`` -- dicts mapping byte
        length to a count: all matches, and those whose hex text also occurs
        in the third, fourth and fifth inputs respectively.
    """
    def bump(counter, key):
        # Increment counter[key], starting from zero when absent.
        counter[key] = counter.get(key, 0) + 1

    # Phase 1: parse every file, keeping only the second tuple element.
    byte_arrays = []
    for path in (f1, f2, f3, f4, f5):
        _first, arr, _third, _fourth = parse(path)
        byte_arrays.append(arr)

    # Phase 2: byte values -> raw strings -> hex strings.
    raw_strings = [''.join(chr(x) for x in arr) for arr in byte_arrays]
    hex1arr, hex2arr, hex3arr, hex4arr, hex5arr = [
        binascii.hexlify(raw) for raw in raw_strings]

    matches, matches3, matches4, matches5 = {}, {}, {}, {}
    others = ((hex3arr, matches3), (hex4arr, matches4), (hex5arr, matches5))

    stree = GeneralisedSuffixTree([hex1arr, hex2arr])
    for shared in stree.sharedSubstrings(20):
        for seq, start, stop in shared:
            if seq != 0:
                continue  # only occurrences inside the first input count
            leng = (stop - start) / 2  # two hex digits per byte
            bump(matches, leng)
            match = hex1arr[start:stop]
            for haystack, counter in others:
                if match in haystack:
                    bump(counter, leng)
    return (matches, matches3, matches4, matches5)
def main():
    """Print the longest substring shared by the first two lines of a file.

    The file path is taken from ``sys.argv[1]``. Both lines are stripped of
    surrounding whitespace before a generalised suffix tree is built over
    them. An empty line is printed when nothing is shared.
    """
    with open(sys.argv[1], 'r') as handle:
        first = handle.readline().strip()
        second = handle.readline().strip()

    inputs = [first, second]
    tree = GeneralisedSuffixTree(inputs)

    best = ''
    for group in tree.sharedSubstrings():
        for idx, lo, hi in group:
            candidate = inputs[idx][lo:hi]
            # Strictly longer only: the first substring of a given length wins.
            if len(candidate) > len(best):
                best = candidate
    print(best)
def __computeLCS(self, stringList):
    '''
    Returns a one-element list containing the LCS of the input stringList.

    When the alphabet check fails, the list is split in half, each half is
    solved recursively, and the two one-element results are solved together.
    Otherwise the strings are translated, a generalised suffix tree is built,
    and the first longest shared substring is translated back and returned.
    An input with no shared substring yields [""].
    '''
    # Alphabet of (all characters in) stringList.
    alphabet = self.__getAlphabet(stringList)

    # The alphabet may need too many characters to leave enough terminal
    # characters for each string; if so, divide and recurse.
    if not self.__isComputable(stringList, alphabet):
        middle = len(stringList) / 2
        firstHalf = self.__computeLCS(stringList[0:middle])
        secondHalf = self.__computeLCS(stringList[middle:len(stringList)])
        return self.__computeLCS(firstHalf + secondHalf)

    # Translate characters in stringList before building the tree.
    (stringList, translationDict) = self.__translateCharacters(
        stringList, alphabet)
    stree = GeneralisedSuffixTree(stringList)

    # Collect every shared substring the tree reports.
    sharedSubstrings = [stree.sequences[seq][start:stop]
                        for shared in stree.sharedSubstrings()
                        for seq, start, stop in shared]
    if not sharedSubstrings:
        return [""]

    # max() keeps the first element among equally long candidates.
    lcs = max(sharedSubstrings, key=len)

    # Back translate.
    for translatedChar, originalChar in translationDict.iteritems():
        lcs = lcs.replace(translatedChar, originalChar)
    return [lcs]
def lcsm(strings):
    """Return the longest substring shared among `strings`.

    Each item yielded by ``sharedSubstrings()`` is indexed as a sequence of
    (index, start, stop) triples; the item whose first triple covers the
    widest span is chosen and the corresponding slice of the input returned.

    Raises:
        ValueError: from ``max`` when the tree yields no shared substring.
    """
    def first_span(group):
        # Width of the first occurrence in the group.
        head = group[0]
        return head[2] - head[1]

    tree = GeneralisedSuffixTree(strings)
    widest = max(tree.sharedSubstrings(), key=first_span)
    which, lo, hi = widest[0]
    return strings[which][lo:hi]
baseComplement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'} def revc(seq): return "".join([baseComplement[base] for base in seq[::-1]]) # Build a random string, which should have some short reverse complements already. bases = ['A', 'C', 'G', 'T'] data = ''.join(choice(bases) for i in xrange(400000)) #data = "AGGGTTTCCCTGACCTTCACTGCAGGTCATGCA" # revc TGCATGACCTGCAGTGAAGGTCAGGGAAACCCT # 012345678901234567890123456789012 # 1 2 3 print "Got data" revdata = revc(data) print "Got reverse data" n = len(data) minlength = 18 tree = GeneralisedSuffixTree([data, revdata]) for shared in tree.sharedSubstrings(minlength): _, start, stop = shared[0] seq = data[start:stop] _, rstart, rstop = shared[1] rseq = data[n - rstop:n - rstart] print "Match: {0} at [{1}:{2}] and {3} at [{4}:{5}]".format( seq, start, stop, rseq, n - rstop, n - rstart)
def validateSubstring(strings, seq): for s in strings: if s.find(seq) == -1: return False return True with open('rosalind_lcs.txt') as spec: data = [seq.strip() for seq in spec] # The generalized suffix tree doesn't work well with a large number of strings. # Use the first 10 to generate candidates, and then compare each candidate # (in decreasing length order) to the data to find a common substring. tree = GeneralisedSuffixTree(data[:10]) candidates = [] for shared in tree.sharedSubstrings(5): for seq, start, stop in shared: candidates.append(tree.sequences[seq][start:stop]) break candidates.sort(cmp=None, key=lambda s: len(s), reverse=True) for c in candidates: if validateSubstring(data, c): print c print len(c) break else: print "No common string found!"
from random import choice baseComplement = { 'A' : 'T', 'C' : 'G', 'G' : 'C', 'T' : 'A' } def revc(seq): return "".join([baseComplement[base] for base in seq[::-1]]) # Build a random string, which should have some short reverse complements already. bases = ['A', 'C', 'G', 'T'] data = ''.join(choice(bases) for i in xrange(400000)) #data = "AGGGTTTCCCTGACCTTCACTGCAGGTCATGCA" # revc TGCATGACCTGCAGTGAAGGTCAGGGAAACCCT # 012345678901234567890123456789012 # 1 2 3 print "Got data" revdata = revc(data) print "Got reverse data" n = len(data) minlength = 18 tree = GeneralisedSuffixTree([data, revdata]) for shared in tree.sharedSubstrings(minlength): _, start, stop = shared[0] seq = data[start:stop] _, rstart, rstop = shared[1] rseq = data[n-rstop:n-rstart] print "Match: {0} at [{1}:{2}] and {3} at [{4}:{5}]".format(seq, start, stop, rseq, n-rstop, n-rstart)
#!/usr/bin/env python # -*- coding: utf-8 -*- from suffix_tree import GeneralisedSuffixTree # s1 = u'mississippi' # s2 = u'sippissi' s1 = u'一寸光阴一寸金'; s2 = u'寸金难买寸光阴'; stree = GeneralisedSuffixTree([s1,s2]) for shared in stree.sharedSubstrings(2): print '-'*70 for (seq,start,stop) in shared: print seq, print '['+str(start)+':'+str(stop)+']', ss = stree.sequences[seq][start:stop] print ss.encode('utf-8'), at = stree.sequences[seq][:start]+\ '{'+stree.sequences[seq][start:stop]+'}'+\ stree.sequences[seq][stop:] print at.encode('utf-8') print '='*70
import sys

from suffix_tree import GeneralisedSuffixTree

# Read the two texts from the first two lines of the file named on the
# command line; the context manager closes the handle promptly.
with open(sys.argv[1], 'r') as source:
    Input = source.read().split("\n")
text1 = Input[0].strip()
text2 = Input[1].strip()

stree = GeneralisedSuffixTree([text1, text2])

max_len = 0
# Start from an empty result so the script still writes a (blank) line,
# rather than failing with NameError, when the texts share nothing.
longestSS = ''
for shared in stree.sharedSubstrings():
    # Only the first occurrence in each group is inspected.
    seq, start, stop = shared[0]
    if (stop - start) > max_len:
        max_len = stop - start
        longestSS = stree.sequences[seq][start:stop]

# Write the longest shared substring followed by a newline.
with open("output.txt", 'w') as out:
    out.write(longestSS + "\n")
from suffix_tree import GeneralisedSuffixTree def validateSubstring(strings, seq): for s in strings: if s.find(seq) == -1: return False return True with open('rosalind_lcs.txt') as spec: data = [seq.strip() for seq in spec] # The generalized suffix tree doesn't work well with a large number of strings. # Use the first 10 to generate candidates, and then compare each candidate # (in decreasing length order) to the data to find a common substring. tree = GeneralisedSuffixTree(data[:10]) candidates = [] for shared in tree.sharedSubstrings(5): for seq, start, stop in shared: candidates.append(tree.sequences[seq][start:stop]) break candidates.sort(cmp=None, key=lambda s: len(s), reverse=True) for c in candidates: if validateSubstring(data, c): print c print len(c) break else: print "No common string found!"
import sys

from suffix_tree import GeneralisedSuffixTree

# The first two lines of the input file are the texts to compare. Using a
# context manager guarantees the input handle is closed.
with open(sys.argv[1], 'r') as infile:
    Input = infile.read().split("\n")
text1 = Input[0].strip()
text2 = Input[1].strip()

stree = GeneralisedSuffixTree([text1, text2])

max_len = 0
# Defined up front: without a default, an input with no shared substring
# would leave this name unbound and the final write would raise NameError.
longestSS = ''
for shared in stree.sharedSubstrings():
    # Inspect the first reported occurrence of this shared substring.
    seq, start, stop = shared[0]
    length = stop - start
    if length > max_len:
        max_len = length
        longestSS = stree.sequences[seq][start:stop]

# Emit the result on its own line; the file is closed when the block exits.
with open("output.txt", 'w') as out:
    out.write(longestSS + "\n")
def parseBench(f1, f2, f3, f4, f5):
    """Tabulate substrings shared by f1 and f2 and their presence in f3-f5.

    Each argument is passed to ``parse``; the second element of the returned
    4-tuple is treated as a sequence of byte values and hex-encoded. A
    generalised suffix tree over the first two hex strings yields shared
    substrings (``sharedSubstrings(20)``). Occurrences located in the first
    string are counted per byte length (hex length / 2), overall and for
    those whose hex text also occurs in the third, fourth and fifth inputs.

    Five tab-separated rows are written to stdout: a header numbered from 10,
    a row labelled with ``f1`` minus its last three characters holding the
    overall counts, and three padded rows for the f3, f4 and f5 counts.
    Nothing is returned.
    """
    def count(table, key):
        # Add one to table[key], creating the entry when needed.
        if key in table:
            table[key] += 1
        else:
            table[key] = 1

    def write_counts(table):
        # One tab-terminated cell per key, in ascending key order.
        for key in sorted(table):
            sys.stdout.write(str(table[key]) + "\t")

    # Parse all inputs first, keeping the second element of each result.
    arrays = []
    for path in (f1, f2, f3, f4, f5):
        _a, values, _c, _d = parse(path)
        arrays.append(values)
    packed = [''.join(chr(x) for x in values) for values in arrays]
    hex1arr, hex2arr, hex3arr, hex4arr, hex5arr = [
        binascii.hexlify(blob) for blob in packed]

    matches = {}
    matches3 = {}
    matches4 = {}
    matches5 = {}
    stree = GeneralisedSuffixTree([hex1arr, hex2arr])
    for shared in stree.sharedSubstrings(20):
        for seq, start, stop in shared:
            if seq == 0:
                leng = (stop - start) / 2  # two hex digits encode one byte
                count(matches, leng)
                match = hex1arr[start:stop]
                if match in hex3arr:
                    count(matches3, leng)
                if match in hex4arr:
                    count(matches4, leng)
                if match in hex5arr:
                    count(matches5, leng)

    label = f1[0:-3]
    # Blank padding as wide as the label keeps the columns aligned.
    padding = " " * len(label)

    # Header row: column numbers start at 10.
    sys.stdout.write(padding)
    sys.stdout.write("\t")
    for column in range(len(matches)):
        sys.stdout.write(str(column + 10) + "\t")

    # Labelled row with the overall counts.
    sys.stdout.write("\n" + label + "\t")
    write_counts(matches)
    sys.stdout.write("\n")

    # One padded row per additional input.
    for table in (matches3, matches4, matches5):
        sys.stdout.write(padding)
        sys.stdout.write("\t")
        write_counts(table)
        sys.stdout.write("\n")