def run_ctrie(ngram_list, write_file):
    t = trie.trie()

    print('create trie...')
    beg = time.time()
    for ngram in ngram_list:
        sub = t.setdefault(ngram, 0)
        sub.data += 1
    print('time=', time.time() - beg, 's')

    print('Search...')
    beg = time.time()
    for i in range(find_epoch):
        for ngram in ngram_list:
            sub = t.find_trie(ngram)
            sub.data += 1
    print('time=', time.time() - beg, 's')

    print('write1...')
    beg = time.time()
    with open(write_file + '.1.txt', 'wt') as f:
        for keys, data in trie.TrieIter(t, True):
            f.write(' '.join(str(i) for i in keys) + '\t{}\n'.format(data))
    with open(write_file + '.2.txt', 'wt') as f:
        for n in range(1, max_order + 1):
            for keys, data in trie.LevelIter(t, n, True):
                f.write(' '.join(str(i) for i in keys) + '\t{}\n'.format(data))
    print('time=', time.time() - beg, 's')
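# Hedged usage sketch for run_ctrie above: the n-gram tuples and the globals find_epoch /
# max_order are illustrative assumptions, not values taken from the source.
import random
import time
import trie

find_epoch = 10   # assumed global read by run_ctrie
max_order = 3     # assumed global read by run_ctrie

ngram_list = [tuple(random.randint(0, 99) for _ in range(max_order)) for _ in range(10000)]
run_ctrie(ngram_list, 'ctrie_benchmark')  # writes ctrie_benchmark.1.txt and ctrie_benchmark.2.txt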
def test_basic(self):
    t = trie()
    t.add('next')
    t.add('nexas')
    find = t.search('next')
    self.assertTrue(find)
    self.assertTrue(t.search('nexas'))
    # search substring
    self.assertFalse(t.search('nexa'))
    # search string that is not in the trie
    self.assertFalse(t.search('null'))
def main():
    # Load trie from scheme file. clean code. tested
    language_trie = trie.trie("input.txt")
    level_2_list = get_finalized_suggestion(language_trie, arginp, argcount, "bigram_mal_corpus.txt")
    #training_phase()
    #test(language_trie, "malayalam.txt", module1=True)
    symspell_python.create_dictionary("malayalam.txt")
    #print(symspell_python.dictionary)
    for ii in level_2_list:
        ##print(mlphone_calculator(ii[0]))
        print(symspell_python.get_suggestions(ii[0]))
def test_delete(self):
    t = trie()
    t.add('next')
    t.add('nexas')
    self.assertTrue(t.delete('nexas'))
    self.assertFalse(t.delete('nexas'))
    self.assertFalse(t.delete('ne'))
    self.assertFalse(t.search('nexas'))
    # search a substring of the word just deleted
    self.assertFalse(t.search('nexa'))
    # make sure deletion does not affect other words
    self.assertTrue(t.search('next'))
    self.assertFalse(t.search('ne'))
def get_list_of_candidates(target_string, PAM, gRNA_length, exclude_stop_codons, consider_negative, alt_pams):
    target = target_string
    if consider_negative:
        target_rev = reverse_complement(target)
    if alt_pams is None:
        PAMs = NFiller(PAM).get_list()
    else:
        for pam in alt_pams:
            if len(pam) != 3:
                raise ValueError('Length of one or more PAMs not set to 3')
            for character in pam:
                if character != 'A' and character != 'C' and character != 'G' and character != 'T':
                    raise ValueError('Invalid PAM has been entered')
        PAMs = alt_pams
    len_pams = 3
    candidates_rev = []
    candidates = []
    for PAM in PAMs:
        candidates.extend(find_candidates(target, PAM, gRNA_length))
        if consider_negative:
            candidates_rev.extend(find_candidates(target_rev, PAM, gRNA_length))
    trie_dic = trie.trie()
    for candidate in candidates:
        key = candidate[1]
        if exclude_stop_codons and ('TAG' in key or 'TAA' in key or 'TGA' in key):
            continue
        candidate_position = candidate[0]
        if key not in trie_dic.keys():
            trie_dic[key] = ['+', candidate_position]
        else:
            trie_dic[key].append(candidate_position)
    for candidate in candidates_rev:
        if exclude_stop_codons and ('TAG' in candidate[1] or 'TAA' in candidate[1] or 'TGA' in candidate[1]):
            continue
        key = reverse_complement(candidate[1])
        candidate_position = len(target_string) - candidate[0] - gRNA_length - len_pams
        if key not in trie_dic.keys():
            trie_dic[key] = ['-', candidate_position]
        else:
            trie_dic[key].append(candidate_position)
    return trie_dic
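# Hedged usage sketch for get_list_of_candidates above: the target sequence and parameter
# values are made up, and it assumes reverse_complement, NFiller, find_candidates and the
# trie module from the surrounding project are importable. The result is treated as the
# dict-like mapping the function builds (gRNA key -> [strand, position, ...]).
target = 'ATGGCGTTTAGGCCATTTGGCGTACGGTTTACCGGGTTTAGGCCATGG'
candidates = get_list_of_candidates(target, PAM='NGG', gRNA_length=20,
                                    exclude_stop_codons=True,
                                    consider_negative=True, alt_pams=None)
for grna in candidates.keys():
    print(grna, candidates[grna])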
def test():
    wordsFilename = "top100k.txt"
    tagsFilename = "skip_hashtags-dev.txt"
    t = trie.trie()
    t.insert_file(wordsFilename)
    matches = []
    hash_line = ""
    for line in open(tagsFilename):
        if (line[0] == "#"):
            hashtag = line[1:-1]  # strip leading pound and newline
            hash_line = hashtag
            matches = max_match_hashtag(hashtag, t)
            print hash_line
            if matches:
                for match in matches:
                    print " ", match
def find_family(self):
    family_list = {}
    search_tree = trie()
    skus = self.qb_df.index.values
    for sku in skus:
        search_tree.insert(sku)
        family_sku = self.qb_df.ix[sku, 'Family sku']
        if pd.notna(family_sku):
            family_list[sku] = family_sku.split(", ")
    for i in skus:
        prefix_i = self.find_family_prefix(i)
        for j in skus:
            prefix_j = self.find_family_prefix(j)
            if i != j and search_tree.find_prefix(prefix_j) and prefix_i == prefix_j:
                if i not in family_list:
                    family_list[i] = []
                family_list[i].append(j)
    self.family = family_list
def build_kmers_tries(kmers_filename, goodkeys_filename, badkeys_filename,
                      kmers_trie_filename, genome, altpam, pampos, maxcount, n):
    util.check_file_exists(kmers_filename)
    if goodkeys_filename:
        goodkeys = gzip.open(goodkeys_filename, 'w')
    if badkeys_filename:
        badkeys = gzip.open(badkeys_filename, 'w')
    kmers_trie = trie.trie()
    f = gzip.open(kmers_filename)
    for line in f:
        kmer, coord = line.strip().split()
        kmer2 = kmer[n:]
        if kmers_trie.has_key(kmer2):
            arr = kmers_trie[kmer2]
            if len(arr) < maxcount + 1:
                coord_int = util.map_coord_to_int(coord, genome)
                arr = np.append(arr, coord_int)
                arr[0] = len(arr) - 1
                kmers_trie[kmer2] = arr
        else:
            coord_int = util.map_coord_to_int(coord, genome)
            label = 0
            if pampos == 'start' and any(kmer.startswith(p) for p in altpam):
                label = 1
            if pampos == 'end' and any(kmer.endswith(p) for p in altpam):
                label = 1
            kmers_trie[kmer2] = np.array([label, coord_int])
            if label == 0:
                goodkeys.write('%s\n' % kmer)
            if label != 0:
                badkeys.write('%s\n' % kmer)
    save_single_trie(kmers_trie, kmers_trie_filename)
    goodkeys.close()
    badkeys.close()
    f.close()
def test_with_gs():
    wordsFilename = "test_vocabulary.txt"
    tagsFilename = "hashtags-test.txt"
    t = trie.trie()
    t.insert_file(wordsFilename)
    gs = []
    gs_line = ""
    matches = []
    hash_line = ""
    for line in open(tagsFilename):
        if (line[0] == "$"):
            gs_line = line[1:-1]
            gs = gs_line.split(',')
        if (line[0] == "#"):
            hashtag = line[1:-1]  # strip leading pound and newline
            hash_line = hashtag
            matches = max_match_hashtag(hashtag, t)
            if gs and matches:
                print "-->", hash_line
                print "GS>", gs_line
                n = min(len(matches), len(gs))
                for i in xrange(0, n):
                    print "T:", i, " " + matches[i] + " " + gs[i]
                    if (matches[i] != gs[i]):
                        print "ERROR:"
                gs = []
                gs_line = ""
                matches = []
                hash_line = ""
            else:
                print hash_line
                if matches:
                    for match in matches:
                        print match
def child(M, X):
    """
    Input:
        - M: context matrix
        - X: list of attributes
    Output:
        - res: list of potential children (obj, att)
    """
    res = []
    objX = common_objects(M, X)
    L = without(range(len(M[0])), X)
    t = tr.trie(-1, [], [], [])
    for i in L:
        obji = objr(M, i, objX)
        t.insert_trie(i, obji)
    S = t.equivalence()
    for s in S:
        res.append((s[0], sorted(X + s[1])))  # sorted ?
    return res
def create_trie(filename):
    text = open(filename, buffering=1)
    # Create an empty trie called file_trie
    file_trie = trie()
    # Regex for parsing each line
    pattern = re.compile(r"^(\w+?)\s*?(\d+?)$")
    # Process the file line by line
    while text:
        line = text.readline()
        if not line:
            break
        # Parse with regex
        match = pattern.match(line)
        if match is not None:
            # Add each line's word and hit count to the trie
            file_trie.add_child(match.groups()[0], int(match.groups()[1]))
    text.close()
    return file_trie
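# Hedged usage sketch for create_trie above: 'word_counts.txt' is a hypothetical input file
# whose lines match the regex in create_trie, i.e. a word followed by a hit count, e.g.
#
#   apple 12
#   apply 7
#   banana 3
#
file_trie = create_trie('word_counts.txt')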
log_score = False
if log_score and no_cca:  # if we are not writing scores, then log scores will be ignored
    sys.stderr.write("Warning! Ignoring log_score ('-l') option, since no_cca flag ('-c') is on\n")
param_filename = args[0]
output_dir = args[1]
num_process = int(args[2])
param_fh = open(param_filename, 'rb')
model = cPickle.load(param_fh)
extractor = cPickle.load(param_fh)
param_fh.close()
phrase_pairs = ["[X] ||| " + pair for pair in model.get_tokens()]
phrase_pairs.append("[X] ||| [X,1] [X,2] ||| [1] [2]")
phrase_pairs.append("[X] ||| [X,1] [X,2] ||| [2] [1]")
#dev_grammars=args[3]
grammar_trie = trie(phrase_pairs)
print "Data structures from training stage loaded"
if discretize != "":  # compute relevant statistics for discretization
    compute_feature_thresholds(model, discretize)

'''
declaration of list that maintains which sentences have failed across all processes
'''
def init(fs):
    global failed_sentences
    failed_sentences = fs

def main():
    failed_sentences = mp.Manager().list()
    pool = mp.Pool(processes=num_process, initializer=init, initargs=(failed_sentences,))
    for sent_num, line in enumerate(sys.stdin):
import os
import stat
import re
from directory_to_list import directory_to_list
from trie import trie

dirfilepath = input('type the filepath of the directory: ')
dir_to_list = directory_to_list(dirfilepath)
dir_to_list.setprefixes()
myprefix = dir_to_list.get_myprefix()
file_list = dir_to_list.get_file_list()
our_trie = trie(myprefix, file_list)
our_trie.make_trie()

# The prefixes/words to search
a = []
p = ''
while True:
    p = input('enter the prefixes, else to exit enter q: ')
    if p == 'q':
        break
    else:
        a.append(p)
our_trie.check_prefixes(a)
        return path
    else:
        return ''


def trie_matching(text, trie):
    occurences = defaultdict(list)
    for i in range(len(text)):
        postfix = text[i:]
        path = prefix_trie_matching(postfix, trie)
        if path:
            occurences[path].append(i)
    return occurences


if __name__ == '__main__':
    inp, out = small_example()
    #inp, out = big_example()
    inp = read_dataset()
    root, edges = trie(map(Seq, inp[1:]))
    occurences = trie_matching(Seq(inp[0]), root)
    positions = ''
    for p in inp[1:]:
        if p in occurences:
            positions += ' '.join(map(str, occurences[p])) + '\n'
    #positions = prefix_trie_matching(Seq(inp[0]), root)
    write_result(positions)
# read all the files with .sgm extension
for f in all_files:
    if f[-4:] == ".sgm":
        fi = open("reuters21578/" + f, mode='r', encoding='latin-1')
        text = fi.read()
        # print(f)
        while True:
            start = text.find("<REUTERS")
            end = text.find("</REUTERS>")
            #print(text[start:end])
            if start == -1:
                break
            getWords(text[start:end])
            text = text[end + 11:]

# create a trie object and add every word recorded in the files
tr = trie()
for word in inversedict:
    tr.add(word)

# pickle the trie object for later use
triefile = open("trie.pickle", 'wb')
pickle.dump(tr, triefile)
triefile.close()

# json the invertedindex dictionary for later use
with open('invertedindex.json', 'w') as outfile:
    json.dump(inversedict, outfile)

print(inversedict.keys())
#!/usr/bin/python
import trie

search_structure = trie.trie()


def readAndInsertWords(filename):
    fh = open(filename, "r")
    name_list = fh.readlines()
    for name in name_list:
        name = name[0:len(name) - 1]
        search_structure.insert(name)
    fh.close()


def searchWord(word):
    return search_structure.search(word)


readAndInsertWords("names.txt")

if __name__ == "__main__":
    while True:
        word = raw_input("Enter word to be searched\n")
        ret_val = searchWord(word)
def setUp(self):
    self.dict = trie.trie()
    self.dict.readDict("../dict.txt")
    self.b = board.board(5, dict)
optsDict["nodeMarginal"] = 1 #if true, we print out heat maps elif opt[0] == '-f': #if marginal is < 0, we flip the sign optsDict["flipSign"] = 1 elif opt[0] == '-m': #MLE optsDict["MLE"] = 1 elif opt[0] == '-s': #source norm optsDict["sourceNorm"] = 1 elif opt[0] == '-t': #target norm optsDict["targetNorm"] = 1 elif opt[0] == '-x': #only write out marginals for non-lexical rules in source optsDict["onlyXX"] = 1 params_fh = open(args[0], 'rb') paramDict = cPickle.load(params_fh) #key is 'LHS ||| src RHS' grammar_rules = [rule for rule in paramDict.keys() if rule != "Pi"] #Pi contains the start of sentence params grammarTrie = trie(grammar_rules) rank = int(args[1]) inputFile = open(args[2], 'r').readlines() numProcesses = int(args[3]) outDir = args[4] if not os.path.exists(outDir): os.makedirs(outDir) ''' declaration of list that maintains which sentences have failed across all processes ''' def init(fs, fls): global failed_sentences, flipped_sentences failed_sentences = fs flipped_sentences = fls
def amma_api_initiate_trie():
    # To load trie from scheme file. clean code. tested
    return trie.trie("input.txt")
def load_model(file_name):
    # Restore a previously pickled trie from disk
    with open(file_name, 'rb') as obj_file:
        tree = pickle.load(obj_file)
    return tree
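# Hedged counterpart to load_model above: a minimal save_model sketch; the function name
# and file handling are assumptions, not part of the source.
def save_model(tree, file_name):
    # Serialize the trie so load_model can restore it later
    with open(file_name, 'wb') as obj_file:
        pickle.dump(tree, obj_file)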
#
# Python 2.6

import trie

#wordsFilename = "top100k.txt"
#wordsFilename = "top10k.txt"
#wordsFilename = "top1k.txt"
wordsFilename = "top100.txt"
#wordsFilename = "top20.txt"
#wordsFilename = "test.txt"

tagsFilename = "hashtags_dev.txt"
tagsFilename = "skip_hashtags-dev.txt"

t = trie.trie()
t.insert_file(wordsFilename)

for line in open(tagsFilename):
    hashtag = line[1:]
    print "----------------------" + hashtag
    part = ""
    index = 0
    while index < len(hashtag):
        part = part + hashtag[index]
        if t.is_word(part):
            print part
            part = ""
        index += 1
    print "----------------------"
# Also, I won't be using many libraries and want to keep it that way unless absolutely necessary
import pickle
import re
import numpy as np
import sys
from trie import trie

# Building the player model
player = trie()
with open('Resources/booklist.txt') as booklist:
    books = booklist.readlines()

for book in books:
    fil = open("Resources/" + book[:-1])
    text = fil.read()
    words = re.split(r'\W+', text)
    words = np.unique(np.array(words))
    n_block = 30
    done = "["
    rem = (n_block) * "~"
    block = int(len(words) / n_block)
    for i, word in enumerate(words):
        print(f"\rReading from {book[:-1]} : {done}{rem}] : {i*100/(len(words)):0.2f}% ", end="")
        if (i % block == 0):
            # Progress Bar
            done += "="
def generate_adjacent_mers(sequence, max_hamming_distance):
    alphabet = 'AGCT'
    t = trie.trie()
    hamming_ball(sequence, max_hamming_distance, alphabet, t)
    return t
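# Hedged usage sketch for generate_adjacent_mers above: it assumes hamming_ball populates
# the passed trie with every k-mer within the given Hamming distance of the input sequence,
# and that the trie object exposes a dict-like keys() method.
neighbours = generate_adjacent_mers('ACGT', 1)
print(len(neighbours.keys()))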