def search2(P, sStr):
    """Return all start positions of substring P in sStr, sorted ascending.

    Builds a suffix array of sStr, then binary-searches for the lower and
    upper bounds of the contiguous run of suffixes whose first len(P)
    characters equal P.  Returns [] when P does not occur.
    """
    sa = tks.simple_kark_sort(sStr)
    m = len(P)
    n = len(sStr)
    # Lower bound: first suffix whose m-char prefix is >= P.
    left, right = 0, n  # length of sa is n+1
    while left < right:
        mid = (left + right) >> 1
        # Direct string comparison replaces the Python-2-only cmp();
        # behavior is identical and it also runs under Python 3.
        if sStr[sa[mid]:sa[mid] + m] >= P:
            right = mid
        else:
            left = mid + 1
    start = left
    if sStr[sa[left]:sa[left] + m] != P:
        return []
    # Upper bound: first suffix whose m-char prefix is > P.
    left, right = 0, n  # length of sa is n+1
    while left < right:
        mid = (left + right) >> 1
        if sStr[sa[mid]:sa[mid] + m] > P:
            right = mid
        else:
            left = mid + 1
    end = left
    # Suffix-array order is lexicographic, not positional: sort positions.
    result = [sa[i] for i in range(start, end)]
    result.sort()
    return result
def getBWTAndSA(s, psa):
    """Return (BWT of s, subsampled suffix array).

    The Burrows-Wheeler transform entry for suffix-array slot i is the
    character immediately preceding that suffix, wrapping around cyclically.
    """
    sa = tks.simple_kark_sort(s)
    length = len(s)
    # Character before each suffix; % length makes position 0 wrap to the end.
    bwt = [s[(pos - 1) % length] for pos in sa]
    return (bwt, subsampleArray(sa, psa))
def get_pair_longest_overlap(fragments, min_overlap): ''' Generator returning maximum overlap matches betwen pairs of fragments. Algorithm: Concatenate fragments + labels into a single string Build a suffix array from string Compute the longest common prefix (LCP) for each element in the array Sort LCP array by size of LCP for each element in sorted LCP: where the LCP is greater than the minimum overlap Extract the LCP element label and the label of the following element. These two elements have the largest overlap in the suffix array so yield them. ''' # Build the concatenated fragment + label string. concat_frags = build_fragment_str(fragments) # Build a suffix array via the karkkainen sanders algorithm # Then compute the longest common prefixes sa = tks.simple_kark_sort(concat_frags) lcp = tks.LCP(concat_frags,sa) # Sort the LCP by size largest to smallest. sorted_lcp = sorted(enumerate(lcp),key=lambda x:x[1], reverse=True) # Iterate through sorted LCP list. for cur_lcp_pos, max_lcp_val in sorted_lcp: # If the overlap of this LCP entry is smaller than the minimum overlap # then stop yielding label pairs if max_lcp_val < min_overlap: break # Step through contiguous elements in the suffix array and extract # labels. labels = [] while len(labels) < 2: # Labels are integers prefixed with "$$$" and followed by "!!!" label_start = concat_frags.find('$$$', sa[cur_lcp_pos]) if label_start < 0: break label_start += 3 label_end = concat_frags.find('!!!', label_start) # Extract the label and convert from string to int label = int(concat_frags[label_start: label_end]) labels.append(label) cur_lcp_pos += 1 # If the two entries in the suffix array come from the same fragment # then go to the next highest LCP entry. if len(labels) < 2 or labels[0] == labels[1]: continue yield labels[0], labels[1]
def get_sa_lcp(s):
    """Return (suffix array, LCP array) of s, each trimmed to len(s).

    The input bytes are decoded as UTF-8 (undecodable bytes replaced)
    before indexing.  Python 2 only (uses unicode()).
    """
    text = unicode(s, 'utf-8', 'replace')
    size = len(text)
    suffix_arr = tks.simple_kark_sort(text)
    lcp_arr = tks.LCP(text, suffix_arr)
    # tks pads its output arrays past len(text); trim to the real length.
    return (suffix_arr[:size], lcp_arr[:size])
def __init__(self, s):
    """Index string s: store its suffix array and BWT, then build tables."""
    self.s = s
    self.n = len(s)
    # This tks variant returns a pair; only the suffix array is needed.
    _, suffixes = tks.simple_kark_sort(s)
    self.sa = suffixes[:self.n]
    # self.lcp = tks.LCP(s, self.sa)
    # BWT: character preceding each suffix (index -1 wraps to the last char).
    self.bwt = ''.join(s[self.sa[i] - 1] for i in xrange(self.n))
    self.init_bwt()
    self.init_isa()
def longest(s):
    """Return the longest substring of s that occurs at least twice.

    The maximum value in the LCP array of s is exactly the length of the
    longest repeating substring; the corresponding suffix start gives its
    position.
    """
    sa = tks.simple_kark_sort(s)
    lcp = tks.LCP(s, sa)
    best_idx = -1
    best_len = -1
    # Track the first index holding the maximum LCP value.
    for idx, run in enumerate(lcp):
        if run > best_len:
            best_idx = idx
            best_len = run
    return s[sa[best_idx]:sa[best_idx] + best_len]
def longest(s):
    """Return (occurrence count, longest repeating substring) of s.

    NOTE(review): the count is a heuristic.  It adds one occurrence for every
    LCP entry equal to the maximum whose suffix starts with the same first
    character as the winning suffix; if several distinct maximal substrings
    share that first character this overcounts — confirm against callers.
    """
    n = len(s)  # NOTE(review): n is never used below.
    # This tks variant returns a pair; only the suffix array is needed.
    ns, sa = tks.simple_kark_sort(s)
    lcp = tks.LCP(s, sa)
    # Find the first index holding the maximum LCP value.
    maxI, maxV = -1, -1
    for i, v in enumerate(lcp):
        if v > maxV:
            maxI, maxV = i, v
    # Each LCP entry equal to maxV marks one additional adjacent occurrence.
    count = 1
    for i, v in enumerate(lcp):
        if s[sa[i]] == s[sa[maxI]] and v == maxV:
            count += 1
    return count, s[sa[maxI]:sa[maxI] + maxV]
def get_seq(fasta: str):
    """Read a reference FASTA file and build the suffix array of its sequence.

    Header lines (starting with ">") are skipped; all remaining lines are
    concatenated into the sequence.  A terminal "$" sentinel is appended so
    the string is usable by the Burrows-Wheeler / suffix-array functions.

    :param fasta: path to the reference FASTA file
    :return: (sequence ending in "$", suffix array of that sequence)
    """
    parts = []
    with open(fasta) as fasta_file:
        for line in fasta_file:
            line = line.strip()
            # Skip headers and blank lines.  The original tested line[0]
            # directly, which raised IndexError on an empty line.
            if line and not line.startswith(">"):
                parts.append(line)
    # Bug fix: the original overwrote the sequence on every iteration, so a
    # multi-line FASTA record kept only its last line.  Join all lines and
    # append the "$" sentinel exactly once.
    s = "".join(parts) + "$"
    sa = tks.simple_kark_sort(s)
    return s, sa
def search(P, sStr):
    """Return the start index of one occurrence of P in sStr, or -1.

    Binary-searches the suffix array for the first suffix whose len(P)-char
    prefix is >= P; that suffix matches P if and only if P occurs in sStr.
    """
    sa = tks.simple_kark_sort(sStr)
    m = len(P)
    n = len(sStr)
    left, right = 0, n  # length of sa is n+1
    while left < right:
        mid = (left + right) >> 1
        # Direct string comparison replaces the Python-2-only cmp();
        # behavior is identical and it also runs under Python 3.
        if sStr[sa[mid]:sa[mid] + m] >= P:
            right = mid
        else:
            left = mid + 1
    if sStr[sa[left]:sa[left] + m] == P:
        return sa[left]
    else:
        return -1
#Initialization of reads and readsInv, its reverse complementary reads, readsBioPalind = [], [] for line in open(readsfile, "r"): if line[0] != ">": #lines with > do not contain sequences, but merely comments about the sequences. reads.append( line[:-1].lower() ) #-1 to remove \n. To lower case for practical reasons when calling posdict. readsBioPalind.append(biologicalPalyndrome( line[:-1].lower())) #We also stock the biological palyndromes #We create SA, BWT, Rank and F from reference print("generating SA") startChrono() refSA = tks.simple_kark_sort(reference) print(" done in " + str(endChrono()) + " s") print("generating BWT") startChrono() refBWT = getBWT(reference, refSA) print(" done in " + str(endChrono()) + " s") print("generating ranks") startChrono() refRank = getRank(refBWT) print(" done in " + str(endChrono()) + " s") print("generating F") startChrono() refF = getF(refBWT) print(" done in " + str(endChrono()) + " s") """""" """""" """ usefull functions
#!/usr/bin/env python # -*- coding: utf-8 -*- import tools_karkkainen_sanders as tks #s = open('ooo.txt').read() #s = open('Python.htm').read() s = 'ab'*10000 #s = 'abab' s = unicode(s,'utf-8','replace') n = len(s) ns, sa = tks.simple_kark_sort(s) lcp = tks.LCP(s,sa) #print sa #print lcp #print sa for i in xrange(n-1) : if(s[sa[i]:] > s[sa[i+1]:]) : print s[sa[i]:][:40] print s[sa[i+1]:][:40] print '='*50
import tools_karkkainen_sanders as tks import sys sys.stdin = open('input.txt') numTest = int(input()) for itertest in range(numTest): line = raw_input().strip() m = len(line) line = line * 2 SA = tks.simple_kark_sort(line) for v in SA: if v < m: print v + 1 break
def build(self):
    """Construct the suffix array and LCP table of the corpus string."""
    corpus = self.corpus_str
    self.sa = tks.simple_kark_sort(corpus)
    self.lcp = tks.LCP(corpus, self.sa)
def construct_suffix_array(x):
    """Return the suffix array of x (UTF-8 decoded), trimmed to len(x).

    Python 2 only (uses unicode()); undecodable bytes are replaced.
    """
    decoded = unicode(x, 'utf-8', 'replace')
    # tks pads its output; keep only the first len(decoded) entries.
    return tks.simple_kark_sort(decoded)[:len(decoded)]
#!/usr/bin/env python # -*- coding: utf-8 -*- import tools_karkkainen_sanders as tks s = 'ababbbaaabbaddavvabba' s = unicode(s, 'utf-8', 'replace') n = len(s) sa = tks.simple_kark_sort(s) lcp = tks.LCP(s, sa) print sa print lcp # 1/0 for i in xrange(n - 1): # if s[sa[i]:] > s[sa[i+1]:]: print s[sa[i]:][:40] #print s[sa[i+1]:][:40] #print '='*50
# NOTE(review): this chunk is cut at both ends — the first four statements
# continue a `while True:` read loop that begins above this excerpt, and the
# body of the final `if K == 0:` continues below it.  Tokens kept verbatim.
    line = raw_input().strip()
    if line == 'END TDP CODEBASE':
        break
    tdpStr.append(line + '\n')
jcnStr = []
raw_input()
# Read the second code base up to its end marker.
while True:
    line = raw_input()
    if line.strip() == 'END JCN CODEBASE':
        break
    jcnStr.append(line + '\n')
N = len(''.join(jcnStr))
M = len(''.join(tdpStr))
# Concatenate both code bases with a '$' separator; a common substring shows
# up as adjacent suffix-array entries coming from opposite sides of N.
S = ''.join(jcnStr) + '$' + ''.join(tdpStr)
SA = tks.simple_kark_sort(S)
LCP = tks.LCP(S, SA)
sortedLCP = []
L = N + M + 1
# For every SA interval whose endpoints straddle the separator, record the
# interval's minimum LCP (the shared substring length) and its position.
for i in range(L):
    for j in range(i, L):
        begin = SA[i]
        end = SA[j + 1]
        if (begin > N and end < N) or (begin < N and end > N):
            # Negated so that ascending sort puts the longest match first.
            sortedLCP.append((-1 * min(LCP[i:j + 1]), min(begin, end)))
sortedLCP.sort()
used = []
for dup in sortedLCP:
    if K == 0:
# Build the reference index (suffix array + BWT) and save it as a CSV.
if format(args) != 'Namespace()':
    ref = open(str(args.ref), 'r')
    ################READ AND KEEPBACK SEQUENCE OF INPUT FILE################
    """ To use the Burrows Wheeler algorithm we need to add a "$" at the end
    of sequence to be usable by the other functions """
    sequence = ''
    for line in ref:
        line = str(line).replace('\n', '')  ##add to delete line break
        if '>' not in line:
            # Bug fix: accumulate every sequence line.  The original assigned
            # `sequence = str(line) + "$"`, so a multi-line FASTA kept only
            # its last line (and embedded "$" mid-sequence on each pass).
            sequence += line
    # Append the "$" sentinel exactly once, after the whole sequence is read.
    sequence += "$"
    ref.close()  # close the input handle (was leaked)
    ################SA[i] KEEP BACK################
    sa = tks.simple_kark_sort(
        sequence
    )  ##keep back of SA[i] calculated thanks to tools_karkkainen_sanders
    ################CREATION OF OUT FILE################
    """ Save the index in a file as dataframe format, path indication in
    argues at the beginning """
    # get_BWT was previously called twice with identical arguments; the
    # standalone call's result was discarded, so it is computed once here.
    d = {'SA[i]': sa, 'BWT': get_BWT(sequence, sa)}
    df = pd.DataFrame(data=d)
    df.to_csv(str(args.out), encoding='utf-8', index=False, mode='w',
              header=True)