def main(): """ runs main script """ # argv input list input_files = sys.argv flu_fasta = input_files[1] rhino_fasta = input_files[2] patient_fastq = input_files[3] flu_fasta_ref = list(SeqIO.parse(flu_fasta, "fasta")) rhino_fasta_ref = list(SeqIO.parse(rhino_fasta, "fasta")) patient_fastq_reads = list(SeqIO.parse(patient_fastq, "fasta")) # creates array of string objects of the sequences flu_seq = [str(flu.seq) for flu in flu_fasta_ref] rhi_seq = [str(rhi.seq) for rhi in rhino_fasta_ref] # creating the suffix tree object flu_suffix = STree.STree(flu_seq) rhi_suffix = STree.STree(rhi_seq) # arbitrary score, entire sequence must match to trigger flu_score = 0 rhi_score = 0 positive_strain_list = [] # goes through each contig, but sequence must match entirely for read in patient_fastq_reads: if flu_suffix.find_all(str(read.seq)): flu_score += 1 for record in flu_fasta_ref: if str(read.seq) in record.seq: positive_strain_list.append(str(record.description)) elif rhi_suffix.find_all(str(read.seq)): rhi_score += 1 for record in rhino_fasta_ref: if str(read.seq) in record.seq: positive_strain_list.append(str(record.description)) with open('patient-report.txt', 'w') as output: output.write("Sequencing results are detecting the following:\n") output.write('\n'.join(positive_strain_list)) if flu_score and not rhi_score: output.write("\nPatient is positive for the Influenza A virus.") elif rhi_score and not flu_score: output.write( "\nPatient is positive for the Human Rhinovirus Strain 89") elif not flu_score and not rhi_score: output.write( "\nInfluenza A virus and Human Rhinovirus not detected in patient." )
def common_patterns(self, sequences):
    list_of_sequences = []
    # Just a bit of housekeeping: the scores arrive as lists, but this
    # function needs them as strings. Hence the "hack-ish" code.
    for sequence in sequences:
        voice = "".join(sequence)
        list_of_sequences.append(voice)
    # To be as fast as possible, window-search the smallest string
    list_of_sequences = sorted(list_of_sequences, key=len)
    # Construct the generalised suffix tree
    suffix_tree = STree.STree(list_of_sequences)
    # List of common sequences
    matches = []
    # Search window size
    lcs = suffix_tree.lcs()
    window_size = len(lcs)
    # Lower bound of the search window
    window_lower = 0
    # Upper bound of the search window
    window_upper = window_lower + window_size
    # If the window size is 0, then suffix_tree.lcs() returned ""
    if window_size == 0:
        print('no match')
    elif lcs == list_of_sequences[0]:
        matches.append(lcs)
        return matches
    # If there was a match, append it
    else:
        matches.append(lcs)
    # Do this until the window size is 0
    while window_size != 0:
        # Create a copy of the sequence list
        list_of_sequences_copy = list_of_sequences[:]
        # Redefine the first element of the copy to only the windowed slice
        list_of_sequences_copy[0] = list_of_sequences[0][
            window_lower:window_upper]
        # Rebuild the suffix tree
        suffix_tree = STree.STree(list_of_sequences_copy)
        # If an unseen, non-empty match of useful length is found, append it
        lcs = suffix_tree.lcs()
        if lcs not in matches and lcs != "" and len(lcs) >= 5:
            matches.append(lcs)
        # Shift the window one to the right
        window_lower += 1
        window_upper += 1
        # Once the upper bound of the window touches the end of the string
        if window_upper == len(list_of_sequences[0]) - 1:
            # Decrease the window size by one
            window_size -= 1
            # Reset the window back to the start
            window_lower = 0
            window_upper = window_lower + window_size
    return matches
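# A minimal usage sketch for common_patterns (hypothetical inputs; self is
# unused by the body, so None is passed here):
#   common_patterns(None, [list("xxabcdefyy"), list("zzabcdefqq")])
#   -> ['abcdef', 'abcde', 'bcdef']  (common substrings of length >= 5;
#      order may vary with the window sweep)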
def multiclusterConversion(labels, n=2):
    # get the alphabet sequence
    alphabet = string.ascii_lowercase
    # map each clustering sequence to an alphabet key (0 -> a, 1 -> b, ...)
    str_labels = np.array([("".join(str(a) for a in label))
                           for label in labels.T])
    code = {key: alphabet[i] for i, key in enumerate(set(str_labels))}
    str_decoded = [code[code_i] for code_i in str_labels]
    # run-length encoding
    decoded_join, rle_decoded, rep_decoded = runLengthEncoding(str_decoded)
    decoded_wordLst, ngrams_decoded = Ngrams(rle_decoded, n=n)
    pos_decoded = NgramsPos(rep_decoded, n=n)
    # bag of words
    bow = BagofWords(ngrams_decoded)
    mre_item = max(bow, key=lambda k: bow[k])
    print(mre_item)
    # suffix tree: locate every occurrence of the most frequent item
    st = STree.STree(ngrams_decoded)
    indexs = st.find_all(mre_item)
    return ngrams_decoded, pos_decoded, indexs
def arbol(txt, findTxt):
    st = STree.STree(txt)
    first = st.find(findTxt)
    print("The first occurrence of the pattern is at position: {}".format(first))
    all_positions = st.find_all(findTxt)
    print("All positions: {}".format(all_positions))
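# A minimal usage sketch (indices are 0-based; the exact container type
# returned by find_all may vary by library version):
#   arbol("banana", "an")
#   # The first occurrence of the pattern is at position: 1
#   # All positions: [1, 3]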
def find_file_index_lcs(file, byte_array, lcs):
    # locate the character offset of the LCS inside the decoded file string
    file_st = STree.STree(file)
    start_of_lcs_in_file = file_st.find(lcs)
    index = 0
    count = 0
    # walk the byte array until the running character count reaches the
    # start of the LCS
    while count < start_of_lcs_in_file:
        substring = re.split("'", str(byte_array[index]))
        byte = substring[1]
        for b in byte:
            count += 1
        index += 1
    start_index = index
    # keep walking until the full length of the LCS has been covered
    while count < (start_of_lcs_in_file + len(lcs)):
        substring = re.split("'", str(byte_array[index]))
        byte = substring[1]
        for b in byte:
            count += 1
        index += 1
    end_index = index
    indices = [start_index, end_index]
    return indices
def test_lcs():
    a = [
        "abeceda", "abecednik", "abeabecedabeabeced", "abecedaaaa",
        "aaabbbeeecceeeddaaaaabeceda"
    ]
    st = STree.STree(a)
    assert st.lcs() == "abeced", "LCS test"
def test_missing():
    text = "name language w en url http w namelanguage en url http"
    stree = STree.STree(text)
    assert stree.find("law") == -1
    assert stree.find("ptth") == -1
    assert stree.find(
        "name language w en url http w namelanguage en url httpp") == -1
def createHeatMapColoring(self, template1, template2, no_sec_peak):
    k = 2
    # get list of k-mers and their frequencies
    struct_profile1 = self.getStructProfile1().getProfile()
    struct_kmer_list = [struct_profile1]
    template = [template1]
    struct_profile_obj2 = self.getStructProfile2()
    if struct_profile_obj2 is not None:
        struct_profile2 = struct_profile_obj2.getProfile()
        struct_kmer_list.append(struct_profile2)
        template.append(template2)
    norm_vector = self.getNormVector()
    result = []
    for i in range(0, len(struct_kmer_list)):
        current_template = template[i]
        current_profil = struct_kmer_list[i]
        # create a suffix tree to find k-mer positions in the template
        template1_s_tree = STree.STree(current_template)
        color_hm1 = {str(pos): 0 for pos in range(1, len(current_template) + 1)}
        color_hm1, not_matched_kmer1, color_domain_max1 = createColorVector(
            k, template1_s_tree, current_profil, color_hm1, no_sec_peak,
            norm_vector)
        result.append([color_hm1, color_domain_max1, not_matched_kmer1])
    return result
def build_One_STree(text):
    '''
    Build a single tree over all of the texts. The resulting tree is too
    large to be written out to a disk file.
    '''
    start = time.time()
    st = STree.STree(text)
    print('Build Tree Total Time: ', time.time() - start)
    return st
def occurrence_of_string_sequence(strings, min_len):
    st = STree.STree(strings)
    longest = st.lcs()
    if len(longest) >= min_len:
        occurrences = st.find_all(longest)
        return len(occurrences)
    else:
        return 0
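# A minimal usage sketch (hypothetical inputs): the longest common substring
# of the two strings below is "abc", which occurs once in each of them:
#   occurrence_of_string_sequence(["xxabcxx", "yyabcyy"], 3)  # -> 2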
def find_longest_repeat(ps):
    """
    Finds the longest repeated sub-sequence in the given string using a
    suffix tree.
    """
    from suffix_trees import STree
    st = STree.STree(ps)
    # the deepest internal node spells the longest substring that occurs
    # more than once (relies on the library's private node attributes)
    deepest = max(st.root._get_leaves(),
                  key=lambda x: x.parent.depth).parent.depth
    return deepest
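# A minimal usage sketch: the longest repeated substring of "banana" is
# "ana", so the deepest internal node has depth 3 (this depends on the
# library's private _get_leaves/depth internals, so it may break across
# versions):
#   find_longest_repeat("banana")  # -> 3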
def tf(self):
    # deduplicate while preserving first-occurrence order (O(n^2))
    unique = [x for i, x in enumerate(self) if i == self.index(x)]
    tf_dict = dict.fromkeys(unique, 0)
    tree = STree.STree(self)
    for i in unique:
        tf_dict[i] = len(tree.find_all(i))
    for i in tf_dict:
        tf_dict[i] = tf_dict[i] / float(len(self))
    return tf_dict
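# A minimal usage sketch, assuming self is a list of single-character tokens
# (hypothetical data; find_all counts occurrences across the joined input,
# so multi-character tokens that contain one another would inflate counts):
#   tf(["a", "b", "a"])  # -> {'a': 0.666..., 'b': 0.333...}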
def smallest_k(sequences):
    t0 = datetime.now()
    print(t0.time(), " experiment start")
    # note: _data is a private Biopython attribute holding the raw sequence
    sequences = [s.seq._data for s in sequences]
    print(datetime.now() - t0, " building tree...")
    tree = STree.STree(sequences)
    print(datetime.now() - t0, " finished building tree, bfs...")
    ret = bfs_find_shortest_uncommon_substring(tree, len(sequences))
    print(datetime.now() - t0, " finished all!")
    return ret
def is_strand_in_all_files(files, strand):
    indices = []
    for i in range(0, 10):
        st = STree.STree(files[i][3])
        index = st.find_all(strand)
        if len(index) > 0:
            strand_start_end = find_file_index_lcs(files[i][3], files[i][2],
                                                   strand)
            element = [i + 1, strand_start_end[0]]
            indices.append(element)
    # print(indices)
    return indices
def longest_common_subsequence_size(self, other):
    # map each distinct trace value to a single letter so the two traces can
    # be compared as strings (note: STree.lcs() finds the longest common
    # substring, i.e. a contiguous run, despite this method's name)
    uniques = list(
        set([x.value for x in self.trace_files] +
            [x.value for x in other.trace_files]))
    uniques = sorted(uniques)
    mine = "".join([
        chr(ord('a') + bisect.bisect_left(uniques, x.value))
        for x in self.trace_files
    ])
    theirs = "".join([
        chr(ord('a') + bisect.bisect_left(uniques, x.value))
        for x in other.trace_files
    ])
    st = STree.STree([mine, theirs])
    common = st.lcs()
    return len(common)
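# A minimal sketch of the same encode-then-lcs trick on plain value lists
# (hypothetical data; assumes fewer than 26 distinct values):
import bisect
from suffix_trees import STree

mine_vals, their_vals = [3, 1, 4, 1, 5], [1, 4, 1, 9]
uniques = sorted(set(mine_vals + their_vals))
mine = "".join(chr(ord('a') + bisect.bisect_left(uniques, v)) for v in mine_vals)
theirs = "".join(chr(ord('a') + bisect.bisect_left(uniques, v)) for v in their_vals)
print(len(STree.STree([mine, theirs]).lcs()))  # -> 3, the shared run 1, 4, 1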
def build_N_STree(text, cut_point=30):
    N = len(text) // cut_point if len(text) % cut_point == 0 \
        else len(text) // cut_point + 1
    strees = []
    start = time.time()
    for i in range(N):
        if i >= N - 1:
            new_text = text[cut_point * i:]
            # print('Number of documents in the last tree:', len(new_text))
        else:
            new_text = text[cut_point * i:cut_point * i + cut_point]
        strees.append(STree.STree(new_text))
    strees.append(cut_point)
    # print('STrees Number: ', len(strees))
    print('Build %d Trees, ' % N, 'Total Build Time: ', time.time() - start)
    return strees
def main(): """ runs main """ # requires the fasta/txt file is in the same directory fasta_text = sys.argv[1] fasta_list = list(SeqIO.parse(fasta_text, "fasta")) sequence_list = [] for record in fasta_list: sequence_list.append(str(record.seq)) st = STree.STree(sequence_list) with open('output-lcs.txt', 'w') as output: output.write(st.lcs())
def __construct_suffix_trees(sample_id, proteins):
    '''
    Constructs a file on disk in the data folder named sample_id.pkl.
    Iterates over all the proteins and creates a map from protein_ids to
    suffix trees of their sequences.
    '''
    tree_map = {}
    for protein in proteins:
        protein_id = str(protein.protein_id)
        protein_seq = str(protein.protein_seq)
        tree_map[protein_id] = STree.STree(protein_seq)
    file_location = os.path.join(BASE_DIR, "data/" + sample_id + ".pkl")
    with open(file_location, 'wb') as f:
        pickle.dump(tree_map, f)
    return tree_map
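# A minimal read-back sketch for the pickled tree map (hypothetical sample
# id and protein id; assumes the .pkl written above exists):
#   with open(os.path.join(BASE_DIR, "data/sample_1.pkl"), "rb") as f:
#       tree_map = pickle.load(f)
#   tree_map["P12345"].find("MKT")  # -1 if the motif is absent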
def test_vectors():
    if not os.getenv("SUFFIX_TREES_TEST_VECTORS") == "1":
        pytest.skip(
            "skipping vectors test. Set SUFFIX_TREES_TEST_VECTORS=1 to run.")
    with open(os.path.join(os.path.dirname(os.path.abspath(__file__)),
                           INPUT), 'r') as f:
        for line in f:
            splitted = line.split(",")
            search_string = splitted[0]
            # Build tree on the search string.
            st = STree.STree(search_string)
            # Test cases
            for case in splitted[1:-1]:
                [s, res] = case.split(":")
                assert int(res) == st.find(
                    s
                ), "Search string: {}, Test case: {}, Expected: {}".format(
                    search_string, s, int(res))
def _load(self, dic_name, debug=False):
    """Reads the .txt files from the given directory and builds the ngram
    and suffix-tree data structures necessary for the search, as well as
    the conversion dictionary."""
    total = ""
    tindex = 0
    total_dict = {}
    n = set()
    v, i, a = (0, 0, 0)
    r = re.compile(r"(\d\d)(\d\d)(\d\d\d)\.txt")
    for filename in sorted(os.listdir(dic_name)):
        # for filename in sorted(listdir('Shaker_Manifesto', dic_name)):
        index = 0
        if filename.endswith(".txt"):
            if debug:
                print("Processing {}".format(filename))
            v, i, a = map(int, r.match(filename).groups())
            w = ''
            with open(os.path.join(dic_name, filename), "rb") as file:
                # with stream(__name__, "{}/{}".format(dic_name, filename)) as file:
                for c in file.read().decode("utf8", errors="replace").lower():
                    if c in " \t\n\r\ufffd,./?\'\";:<>[]{}\\|+=_-()*&^%$#@!~`":
                        # guard against an empty total on the first character
                        if total and total[-1] == ' ':
                            index += 1
                            continue
                        else:
                            total += ' '
                            n.add(w)
                            w = ''
                    else:
                        total += c
                        w += c
                    # record (volume, issue, article, in-file index) for each
                    # character position of the concatenated text
                    total_dict[tindex] = (v, i, a, index)
                    tindex += 1
                    index += 1
    self._tree = STree.STree(total)
    self._index_dict = total_dict
    self._ngram = NGram(n)
def find_longest_strand_in_two_or_more_files(files):
    # largest to smallest file
    sorted_files = sorted(files, key=itemgetter(1), reverse=True)
    files_array = []
    files_byte_array = []
    files_indices = []
    for i in range(0, 10):
        files_array.append(sorted_files[i][3])
        files_byte_array.append(sorted_files[i][2])
        files_indices.append(sorted_files[i][0])
    longest_length = 0
    longest_substr = ""
    files_found = []
    offsets = []
    # print_lengths_of_arrays(files_byte_array)
    broken = False
    for i in range(0, 10):
        for j in range(i + 1, 10):
            if longest_length < len(files_byte_array[i]) and \
                    longest_length < len(files_byte_array[j]):
                arry = [files_array[i], files_array[j]]
                suffix_tree = STree.STree(arry)
                lcs = suffix_tree.lcs()
                file_1_index = find_file_index_lcs(files_array[i],
                                                   files_byte_array[i], lcs)
                file_2_index = find_file_index_lcs(files_array[j],
                                                   files_byte_array[j], lcs)
                length = -1
                if (file_1_index[1] - file_1_index[0] !=
                        file_2_index[1] - file_2_index[0]):
                    print("LCS bytes are not equal!!")
                else:
                    length = file_1_index[1] - file_1_index[0]
                if length > longest_length:
                    longest_length = length
                    longest_substr = lcs
                    files_found = [files_indices[i], files_indices[j]]
                    offsets = [file_1_index[0], file_2_index[0]]
                elif length == longest_length:
                    indices = is_strand_in_all_files(files, lcs)
                    if len(indices) > len(files_found):
                        print("More files with this strand!")
                        longest_substr = lcs
                        files_found = []
                        offsets = []
                        for index in indices:
                            files_found.append(index[0])
                            offsets.append(index[1])
            elif longest_length > len(files_byte_array[i]):
                broken = True
            else:
                break
        if broken:
            break
    indices = is_strand_in_all_files(files, longest_substr)
    files_found = []
    offsets = []
    for index in indices:
        files_found.append(index[0])
        offsets.append(index[1])
    length_filenames_offsets = {
        "length": longest_length,
        "file names": files_found,
        "offsets": offsets
    }
    # print(length_filenames_offsets)
    # check_lcs_of_finds(files, files_found, offsets, longest_length)
    return length_filenames_offsets
# print(s1) f = open("BS19B031_Q1.txt", 'r') text2 = f.read() text2 = text2.lower() tokenizer = RegexpTokenizer(r'\w+') text2 = tokenizer.tokenize(text2) s2 = "" for i in text2: s2 += i # print(s2) mark = np.zeros(len(text2)) k = 4 temp = "" st = STree.STree(s1) for i in range(len(text2) - k): temp = "" for j in range(k): temp += text2[i + j] if st.find(temp) != -1: for j in range(k): mark[i + j] = 1 ans = "" for i in range(len(text2)): if mark[i] == 1: ans += text2[i] ans += " " else: if ans != "":
def tree_test():
    a = ["rpazu", "zupa"]
    tree = STree.STree(a)
def make_tree(self, ex):
    ex['st'] = STree.STree(ex['text'].lower())
    return ex
import string
import random

from suffix_trees import STree


def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
    return ''.join(random.choice(chars) for _ in range(size))


if __name__ == '__main__':
    a = [
        "abeceda", "abecednik", "abeabecedabeabeced", "abecedaaaa",
        "aaabbbeeecceeeddaaaaabeceda"
    ]
    st = STree.STree(a)
    print(st.lcs())

    text = "name language w en url http w namelanguage en url http"
    stree = STree.STree(text)
    print(stree.find('law'))

    st = STree.STree("abcdefghab")
    print(st.find("abc"))  # 0
    print(st.find_all("ab"))  # [0, 8] ---> [] :-(
def get_lcs(chunk_hashes):
    chunk_hashes_input = ['$'.join(chunk_hash) for chunk_hash in chunk_hashes]
    st = STree.STree(chunk_hashes_input)
    return st.lcs()
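# A minimal usage sketch (hypothetical hashes; note the '$' separator is
# part of the returned match):
#   get_lcs([["h1", "h2", "h3"], ["h0", "h2", "h3"]])  # -> "$h2$h3"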
def get_lcs_between2(n, st):
    # (function header and initial values reconstructed from the call site
    # below; the original excerpt begins inside the loop body)
    lcs = ""
    index1 = index2 = -1
    for i in range(0, n):
        for j in range(i, n):
            if i != j:
                temp = st.lcs(stringIdxs=[i, j])
                if len(temp) > len(lcs):
                    lcs = temp
                    index1 = i
                    index2 = j
    return (lcs, index1, index2)


def get_offset(n, lcs):
    offset = []
    for i in range(0, n):
        os = ar[i].find(lcs)
        offset.append(os)
    return offset


ar = init(10)  # read in the files
st = STree.STree(ar)  # build the suffix tree
lcs1 = st.lcs()  # get the lcs across all strings (for curiosity)
# print("Length of byte strands across all files ", len(lcs1), " ", lcs1)
lcs2, index1, index2 = get_lcs_between2(
    10, st)  # actual longest strings between 2 files
offsets = get_offset(10, lcs2)
print("Length of most common strand %d" % len(lcs2))
print("File sample.%d offset at %d" % (index1 + 1, offsets[index1]))
print("File sample.%d offset at %d" % (index2 + 1, offsets[index2]))
""" requirements: ``` $ pip install git+https://github.com/Nanguage/suffix-trees ``` """ import os from suffix_trees import STree, visualize from Bio.Seq import reverse_complement if __name__ == "__main__": s1 = "AAAACGTCGGGATCG" s2 = "GGGCGTAAAGCTCT" T = STree.STree([s1, s2]) Tv = visualize.VisualizeTree(T) dot = Tv.to_graphviz() dot.save("img/q3.dot") os.system("dot -Tpdf img/q3.dot > img/q3.pdf") s1_rc = reverse_complement(s1) s2_rc = reverse_complement(s2) T = STree.STree([s1_rc, s2_rc]) Tv = visualize.VisualizeTree(T) dot = Tv.to_graphviz() dot.save("img/q3-rc.dot") os.system("dot -Tpdf img/q3-rc.dot > img/q3-rc.pdf")
# (this excerpt sits inside enclosing loops from elided code; j, index, x,
# n, letters and numbers are defined there)
# the (n + 1)-th note is the one to be trained on
if n > 1:
    # note estimation based on simple syntax analysis
    current_note = digits_to_str([x[2 * j + index - 1][n - 1]])
    # get a list of all notes played
    for p, q in enumerate(x[2 * j + index - 1]):
        if digits_to_str([q]) not in letters and p < n and q != 0:
            letters.append(digits_to_str([q]))
            numbers.append(q)
    freq = [0] * len(letters)
    # build the suffix tree over the first n notes
    st = STree.STree(digits_to_str(x[2 * j + index - 1][0:n]))
    for q, let in enumerate(letters):
        # determine how frequent the current_note + other-note combination is
        tmp = st.find_all(current_note + let)
        if isinstance(tmp, list):
            freq[q] += len(tmp)
        else:
            freq[q] += 1
    # add the most frequent note as another feature
    x[2 * j + index - 1][n] = numbers[freq.index(max(freq))]
    letters = []
    numbers = []
""" requirements: ``` $ pip install git+https://github.com/Nanguage/suffix-trees ``` """ import os from suffix_trees import STree, visualize if __name__ == "__main__": s1 = "ACGT" s2 = "TGCA" T = STree.STree([s1, s2]) Tv = visualize.VisualizeTree(T) dot = Tv.to_graphviz() dot.save("img/q1.dot") os.system("dot -Tpdf img/q1.dot > img/q1.pdf")