def get_partial_suffix_array(text, k):
    """Build the suffix array of *text* and keep only the entries whose
    suffix start position is a multiple of *k*.

    Returns a list of (rank, suffix_start) pairs, where rank is the
    entry's position in the full suffix array.
    """
    tree = SuffixTree(len(text))
    # Online construction: feed the text one character at a time.
    for char in text:
        tree.add_char(char)
    suffix_array = get_suffix_array(tree)
    # Keep (rank, start) only for suffixes that begin at a multiple of k.
    return [(rank, start)
            for rank, start in enumerate(suffix_array)
            if start % k == 0]
def _addToTree(self, key):
    """Insert *key* into one of the SuffixTrees and return the string
    index assigned to it.

    Insertion is slightly involved because a single SuffixTree holds at
    most strmat.MAXNUMSTR strings and strmat.MAXNUMNODES nodes; once a
    tree is full its add() fails, so we roll over to a fresh tree.
    """
    index = self._next_index
    self._next_index += 1
    # Try the most recently created tree first.
    if self._trees[-1].add(key, index):
        return index
    # Current tree is full -- start a new one and retry the insert there.
    self.debug("Now expanding the tree:")
    self._trees.append(SuffixTree())
    ok = self._trees[-1].add(key, index)
    assert ok
    return index
def test_empty_string(self):
    """Every query against a tree built from '' must miss."""
    empty_tree = SuffixTree('')
    for needle in ('not there', ''):
        self.assertEqual(empty_tree.find_substring(needle), -1)
        self.assertFalse(empty_tree.has_substring(needle))
def test_case_sensitivity(self):
    """A case-insensitive tree matches regardless of query case."""
    # Context manager closes the handle even on assertion failure
    # (the original left the file open).
    with open("test.txt") as f:
        st = SuffixTree(f.read(), case_insensitive=True)
    self.assertEqual(st.find_substring('ukkonen'), 1498)
    self.assertEqual(st.find_substring('Optimal'), 1830)
def test_long_string(self):
    """Exact-case matches in a large text; wrong case must miss."""
    # Context manager closes the handle even on assertion failure
    # (the original left the file open).
    with open("test.txt") as f:
        st = SuffixTree(f.read())
    self.assertEqual(st.find_substring('Ukkonen'), 1498)
    self.assertEqual(st.find_substring('Optimal'), 11131)
    self.assertFalse(st.has_substring('ukkonen'))
def test_repeated_string(self):
    """Substring queries on the degenerate text 'aaa'."""
    st = SuffixTree("aaa")
    # Every run of 'a' up to the full text length starts at index 0.
    for run in ('a', 'aa', 'aaa'):
        self.assertEqual(st.find_substring(run), 0)
        self.assertTrue(st.has_substring(run))
    # Absent characters and over-long runs must miss.
    self.assertEqual(st.find_substring('b'), -1)
    self.assertFalse(st.has_substring('aaaa'))
    self.assertFalse(st.has_substring('b'))
    # Case sensitive by default.
    self.assertFalse(st.has_substring('A'))
## PS4 problem 1: suffix bwt algorithm / Burrows Wheeler Transform (BWT)
## The aim of BWT is to map millions of short reads to the genome in O(m) time
## where m is the length of the short read

from SuffixTree import SuffixTree

if __name__ == "__main__":
    genome = "MANOLISKELLIS"
    bwt = SuffixTree(genome)
    # Locate the read in the BWT tables; sp/ep bound the match range.
    sp, ep, bwtsp, bwtep = bwt.findMatch("OLAS")
    if sp > 0 and ep > 0:
        print("Table ptr locations:")
        print("sp = ", sp, ", ep = ", ep)
        print("-----")
        locations = bwt.getGenomeLocations(bwtsp, bwtep)
        print("Genome locations:")
        print("bwtsp = ", bwtsp, ", bwtep = ", bwtep)
        print(genome)
        # Draw a caret under each genome position where the read maps.
        loc_idx = 0
        for pos in range(len(genome)):
            if loc_idx >= len(locations):
                break
            if locations[loc_idx] == pos:
                print("^", end='')
                loc_idx += 1
            else:
                print(" ", end='')
def __init__(self, debug_flag=0):
    """Set up the initial (single) SuffixTree, an empty key dict, and
    the counter that hands out string indices (starting at 1)."""
    self._debug_flag = debug_flag
    self._next_index = 1
    self._dict = {}
    self._trees = [SuffixTree()]
def get_suffix_array_from_text(text):
    """Construct a suffix tree over *text* and derive its suffix array."""
    tree = SuffixTree(len(text))
    # Online construction: push the text into the tree one symbol at a time.
    for symbol in text:
        tree.add_char(symbol)
    return get_suffix_array(tree)
edges.sort() for e in edges: n = tree.nodes[e[1]] child_depth = depth + n.end - n.start if len(n.edges) == 0: suffix_array.append(n.start - depth) build_suffix_array(n, child_depth) build_suffix_array(tree.nodes[tree.root], 0) return suffix_array def get_suffix_array_from_text(text): tree = SuffixTree(len(text)) for char in text: tree.add_char(char) return get_suffix_array(tree) if __name__ == "__main__": with open(sys.argv[1]) as fh: text = next(fh).strip() tree = SuffixTree(len(text)) for char in text: tree.add_char(char) print ", ".join([str(x) for x in get_suffix_array(tree)])
from Regice import Regice
from SuffixTree import SuffixTree

regice = Regice()
filepath = './sample/index.html'

# Tokenize the reference page and build a suffix tree over its tokens.
# (Fix: reuse the already-defined `filepath` instead of repeating the literal.)
tokens = regice.tokenize_from_file(filepath)
st = SuffixTree(tokens)

tokens2 = regice.tokenize_from_file('./sample/test.html')
print(tokens2)
num_token = len(tokens2)

# Slide windows of every length >= 3 over the second token stream and
# report all suffix-tree matches for each window.
res = []
for i in range(3, num_token):
    for j in range(num_token - i):
        res.append(tokens2[j:j+i])
        print(j, i)
        print([pattern for pattern in st.search_pattern_all(tokens2[j:j+i])])

# regice.analyze(filepath)
# regice.all_similaries()
from DNAHelper.helper_functions import *
from SuffixTree import SuffixTree

alphabet = 'AGTCN'
text = read_genome('../Dataset/phix.fa')
reads, qualities = read_fastq('../Dataset/phix.fastq')

# Generalized suffix tree over the reference genome.
tree = SuffixTree(alphabet)
tree.build_generalized_suffix_tree([text])

count = 0
matches = []
for read in reads:
    # Only the first 30 bases of each read take part in matching.
    read = read[:30]
    # Collect hits for the read and for its reverse complement.
    matches = tree.find(read)
    matches.extend(tree.find(reverse_complement(read)))
    if matches:
        count += 1
print(count, '/', len(reads), 'matched!')
def map_dna(args):
    """Map FASTA reads onto a reference sequence and emit a SAM file.

    args[1] -- path to the reference FASTA file
    args[2] -- path to the reads FASTA file
    args[3] -- seed (k-mer) length used for matching

    For each read, k-mers are looked up in the BWT/FM-index tables; every
    match votes for the candidate mapping position
    ``matching_index - offset_of_kmer_in_read`` and the position with the
    most votes is written out.
    """
    # ---- Parse input -------------------------------------------------
    parser = Parser()
    sequence_name, sequence = parser.parse_fasta_sequence(args[1])
    read_file = args[2]
    k = int(args[3])

    # ---- Build the index structures ----------------------------------
    print("Building Suffix Tree...")
    suffix_tree = SuffixTree(sequence)
    print("DONE")
    print("Building Suffix Array...")
    suffix_array = suffix_array_from_suffix_tree(suffix_tree)
    print("DONE")
    del suffix_tree  # the tree is large; release it once the array exists
    print("Building BWT...")
    bwt = bwt_from_suffix_array(suffix_array, sequence)
    print("DONE")
    print("Building First Occurrence Table...")
    first_occurrences = build_first_occurrence(bwt)
    print("DONE")
    print("Building Counts Table...")
    counts = build_counts(bwt)
    print("DONE")

    # NOTE(review): "%Y-%d-%H-%M" has no month field -- confirm intended.
    timestamp = datetime.now().strftime("%Y-%d-%H-%M")
    file_name = "%s %s.SAM" % (sequence_name, timestamp)
    sam = SAM(filename=file_name, sequence_name=sequence_name,
              sequence_length=len(sequence))
    reads = parser.parse_fasta_reads(read_file)

    # ---- Map each read -----------------------------------------------
    print("**Beginning Mapping Process")
    counter = 0
    matches = {}  # k-mer -> match positions, cached across all reads
    # BUG FIX: dict.iteritems() is Python 2 only and raises AttributeError
    # under Python 3 (which this code targets -- it uses print() calls).
    for read_name, read in reads.items():
        counter += 1
        print("**Mapping Read " + str(counter) + " ")
        max_index = -1
        max_score = -1
        candidate_mapping_indices = {}
        kmer_start_index = 0
        kmer_end_index = kmer_start_index + k
        while kmer_end_index - 1 <= len(read):
            kmer = read[kmer_start_index:kmer_end_index]
            relative_index = kmer_start_index
            if kmer not in matches:
                matches[kmer] = find_pattern_matches(
                    kmer, suffix_array, bwt, first_occurrences, counts)
            if matches[kmer]:
                # Each matching index votes for the read-mapping position
                # matching_index - relative_index.
                for matching_index in matches[kmer]:
                    potential_read_index = matching_index - relative_index
                    if potential_read_index >= 0:
                        if potential_read_index not in candidate_mapping_indices:
                            candidate_mapping_indices[potential_read_index] = 0
                        candidate_mapping_indices[potential_read_index] += 1
                        if candidate_mapping_indices[potential_read_index] > max_score:
                            max_index = potential_read_index
                            max_score = candidate_mapping_indices[potential_read_index]
                # Seed hit: jump ahead by a full k-mer.
                kmer_start_index += k
                kmer_end_index += k
            else:
                # No hit: slide the window by a single base.
                kmer_start_index += 1
                kmer_end_index += 1
        # SAM positions are 1-based, hence max_index + 1.
        sam.append_sam_output(read_name=read_name, cigar="%dM" % len(read),
                              sequence_name=sequence_name,
                              position=max_index + 1)
    print("**MAPPING COMPLETE**")
def __init__(self, code):
    """Parse *code* as HTML, tokenize its body, and index the token
    values in a suffix tree."""
    soup = bs4.BeautifulSoup(code, 'html.parser')
    self.bs = soup
    self.tokens = self.tokenize(soup.body)
    self.start_tags, self.tokens_val = self.extract_tokens(self.tokens)
    self.st = SuffixTree(self.tokens_val)