def load_dicts(self):
    """
    For better runtime speed multiple tries and dictionaries are precomputed,
    mapping between:
        - imagenet_labels: Char to labels and labels to idx (pygtrie).
        - imagenet_classes: Idx to label (dict).
        - wordnet_labels: Char to hyponym and hyponym to idx (pygtrie).
    """
    import json
    path = get_project_root()
    self.imagenet_labels: pygtrie.CharTrie = pygtrie.CharTrie(
        json.load(
            open(path / "assets/imagenet_classes/trie_char_to_name_to_idx.json", "rb"),
        ))
    self.imagenet_classes: dict = json.load(
        open(path / "assets/imagenet_classes/dic_idx_to_label.json", "rb"),
        object_hook=key_to_int,
    )
    self.wordnet_labels: pygtrie.CharTrie = pygtrie.CharTrie(
        json.load(
            open(path / "assets/imagenet_classes/hyponym_imagenet.json", "rb")))

def __init__(self):
    self.config = Config()
    self.slack_client = SlackClient(self.config.bot_token)

    # Create the tries used for auto-complete, if necessary.
    if Bot.company_trie is None:
        Bot.company_trie = pygtrie.CharTrie()
        Bot.symbol_trie = pygtrie.CharTrie()
        path = os.path.dirname(os.path.realpath(__file__))
        # Both exchange listings share the same CSV layout.
        for listing in ('nasdaq.csv', 'nyse.csv'):
            with open(os.path.join(path, 'resources', listing)) as csvfile:
                for row in csv.DictReader(csvfile):
                    cname = row['Name'].strip().upper()
                    symbol = row['Symbol'].strip().upper()
                    Bot.company_trie[cname] = symbol
                    Bot.symbol_trie[symbol] = cname

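# Hedged sketch (the helper name is illustrative, not from the original
# source): with the class-level tries populated, auto-complete reduces to a
# keys(prefix) lookup; pygtrie raises KeyError when no key has the prefix.
def complete_company(prefix):
    try:
        return Bot.company_trie.keys(prefix.strip().upper())
    except KeyError:
        return []
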
def create_blacklist_ip_trie():
    print('* Creating Blacklist IP Trie.')
    try:
        print('* Last update: {}'.format(
            time.strftime('%Y-%m-%d %H:%M:%S',
                          time.localtime(os.path.getmtime(BLACKLISTED_IP_TRIE_JOBLIB)))))
    except OSError:
        # No previous dump exists yet.
        pass
    blacklisted_ip_trie = trie.CharTrie()
    z = download_zipfile()
    if z is not None:
        try:
            with z.open(z.filelist[0].filename) as myfile:
                blip = myfile.read().decode().split('\n')
            for row in blip:
                re_obj = re.search(IP_PATTERN, row)
                if re_obj is not None:
                    matched_ip = re_obj.group().split('\t')[0]
                    blacklisted_ip_trie[matched_ip] = True
            dump(blacklisted_ip_trie, BLACKLISTED_IP_TRIE_JOBLIB)
            print('* Blacklist IP Trie created.')
        except Exception as e:
            print('* ERROR IN CREATING BLACKLIST IP TRIE : ', e)
            sys.exit(1)
    else:
        print('* UNABLE TO UPDATE Blacklist IP Trie!!')
        sys.exit(1)

def load_token_dict_as_trie(token_dict='token_dict.txt'):
    with open(token_dict, 'r', encoding='UTF-8') as f:
        token_set = ast.literal_eval(f.read())
    t = trie.CharTrie()
    for token in token_set:
        t[token] = True
    return t

def test_traverse_compressing(self):
    t = pygtrie.CharTrie()
    t.update({'aaa': 1, 'aab': 2, 'aac': 3, 'bb': 4})

    def make(path_conv, path, children, value=self._SENTINEL):
        children = sorted(children)
        if value is self._SENTINEL and len(children) == 1:
            # There is only one prefix.
            return children[0]
        else:
            return self._TestNode(path_conv(path), children, value)

    r = t.traverse(make)
    # Result:
    # <>
    #   aa
    #     aaa:1
    #     aab:2
    #     aac:3
    #   bb:4
    self.assertNode(r, '', 2)

    # For some reason pylint thinks a_node et al. are strings.
    # pylint: disable=no-member
    aa_node = self.assertNode(r.children[0], 'aa', 3)
    self.assertNode(aa_node.children[0], 'aaa', 0, 1)
    self.assertNode(aa_node.children[1], 'aab', 0, 2)
    self.assertNode(aa_node.children[2], 'aac', 0, 3)
    self.assertNode(r.children[1], 'bb', 0, 4)

def part_two():
    # Need just two strings that differ by a single letter.
    import pygtrie as trie

    # Build a prefix tree out of the words.
    t = trie.CharTrie()
    ids = list(read_lines())
    # Populate the trie; the values are arbitrary (True).
    for word in ids:
        t[word] = True

    def trie_walk(path_conv, chars, children, whatsthis=None):
        path = path_conv(chars)
        # Got two subtrees here, where do they differ?
        words = list(t.keys(path))
        if len(words) == 2 and differ_barely(*words):
            # Found them! Propagate them back up.
            return words
        if len(words) < 1:
            return
        if len(words) > 2:
            # We need to go deeper; listing the children forces the
            # recursive calls.
            result = list(children)
            # Either there is one result, or none.
            # Filter out the result and bubble it back up.
            result = list(filter(lambda x: x, result))
            return result[0] if result else None

    res = t.traverse(trie_walk)
    # Now just print the common letters.
    for a, b in zip(*res):
        if a == b:
            print(a, end='')
    print()

def _hack_build_pygtrie_prefix_freq(items):
    """
    Builds the trie, and then modifies the internal nodes so the sentinel
    values become the frequencies we need.
    """
    import pygtrie
    from collections import deque

    # Construct the tree.
    self = pygtrie.CharTrie(zip(items, [0] * len(items)))

    # Hack into the internal structure and insert frequencies at each node.
    def _iternodes(self):
        """ Generates all nodes in the trie """
        stack = deque([[self._root]])
        while stack:
            for node in stack.pop():
                yield node
                stack.append(node.children.values())

    for node in _iternodes(self):
        if node is not self._root:
            # Don't do this to the root.
            node.value = 0

    # For each item, trace its path and increment frequencies.
    for item in items:
        final_node, trace = self._get_node(item)
        for key, node in trace[1:]:
            node.value += 1
    return self

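# Hedged usage sketch (not from the original source, and it assumes a pygtrie
# version whose node children behave like a dict, as the helper above already
# requires): after the hack, each internal node's value holds how many items
# pass through that prefix, readable via the same private _get_node API.
def _demo_prefix_freq():
    t = _hack_build_pygtrie_prefix_freq(['cat', 'car', 'dog'])
    node, _ = t._get_node('ca')  # private pygtrie API, as used above
    assert node.value == 2       # 'cat' and 'car' share the prefix 'ca'
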
def get_clue_trie(clues, max_difficulty=6, num_clues=50):
    print("Creating clue set")
    clue_trie = pygtrie.CharTrie()
    for clue in clues:
        try:
            diff = int(clue["difficulty"])
            q = clue["question"]
            new_clue = {"difficulty": diff, "hint": q}
        except (KeyError, ValueError):
            print("clue could not be parsed")
            continue

        # Validate the question. str.lower() returns a new string, so the
        # result has to be assigned back.
        question = clue["question"].lower()
        if question == "":
            continue
        if "across" in question or "down" in question or \
                "this puzzle" in question:
            continue

        # Validate the answer.
        if len(clue["answer"]) < 2:
            print("invalid one char answer: " + clue["answer"])
            continue

        if new_clue["difficulty"] >= max_difficulty:
            clue_trie[clue["answer"]] = new_clue
            # num_clues -= 1
            # if num_clues <= 0:
            #     break
    return clue_trie

def get_pygtrie() -> pygtrie.CharTrie:
    global _pygtrie
    if not _pygtrie:
        _pygtrie = pygtrie.CharTrie()
        for word in words():
            _pygtrie[word] = True
    return _pygtrie

def test_traverse(self):
    t = pygtrie.CharTrie()
    t.update({'aaa': 1, 'aab': 2, 'aac': 3, 'bb': 4})

    r = t.traverse(self._make_test_node)
    # Result:
    # <>
    #   a
    #     aa
    #       aaa:1
    #       aab:2
    #       aac:3
    #   b
    #     bb:4
    self.assertNode(r, '', 2)

    # For some reason pylint thinks a_node et al. are strings.
    # pylint: disable=no-member
    a_node = self.assertNode(r.children[0], 'a', 1)
    aa_node = self.assertNode(a_node.children[0], 'aa', 3)
    self.assertNode(aa_node.children[0], 'aaa', 0, 1)
    self.assertNode(aa_node.children[1], 'aab', 0, 2)
    self.assertNode(aa_node.children[2], 'aac', 0, 3)

    b_node = self.assertNode(r.children[1], 'b', 1)
    self.assertNode(b_node.children[0], 'bb', 0, 4)

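# A minimal standalone sketch of the traverse() API the tests above exercise:
# pygtrie calls the node factory bottom-up with (path_conv, path, children,
# value), passing ``value`` only for nodes that actually store one. Here the
# factory simply counts the words stored beneath each node.
import pygtrie

def _count_words(path_conv, path, children, value=None):
    return sum(children) + (0 if value is None else 1)

_t = pygtrie.CharTrie({'aaa': 1, 'aab': 2, 'bb': 4})
assert _t.traverse(_count_words) == 3
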
def set_value(partition_key, sort_key):
    lookup_key = partition_key + ":" + sort_key
    print("{} Saving {} to {}".format(self_server, request.data, lookup_key))
    data[lookup_key] = request.data.decode('utf-8')
    if lookup_key in indexed:
        # Already indexed; nothing more to do.
        return make_response('', 202)
    indexed[lookup_key] = True
    sort_index[lookup_key] = sort_key
    if sort_key not in sort_index:
        sort_index[sort_key] = pygtrie.CharTrie()
    sort_index[sort_key][partition_key] = lookup_key
    if partition_key not in between_index:
        between_index[partition_key] = Tree("", None, None)
    if partition_key not in partition_trees:
        partition_tree = both_between_index.insert(partition_key, Tree("", None, None))
        partition_trees[partition_key] = partition_tree
    between_index[partition_key].insert(sort_key, partition_key, lookup_key)
    partition_trees[partition_key].partition_tree.insert(
        sort_key, partition_key, lookup_key)
    sql_index[sort_key] = lookup_key
    return make_response('', 202)

def _create_conversion_trie(strict):
    """
    Create the trie for betacode conversion.

    Args:
        strict: Flag to allow for flexible diacritic order on input.

    Returns:
        The trie for conversion.
    """
    t = pygtrie.CharTrie()
    for beta, uni in _map.BETACODE_MAP.items():
        if strict:
            t[beta] = uni
        else:
            # The order of accents is very strict and weak. Allow for many
            # orders of accents between asterisk and letter or after letter.
            # This does not introduce ambiguity since each betacode token
            # only has one letter and either starts with an asterisk or a
            # letter.
            diacritics = beta[1:]
            perms = itertools.permutations(diacritics)
            for perm in perms:
                perm_str = beta[0] + ''.join(perm)
                t[perm_str.lower()] = uni
                t[perm_str.upper()] = uni
    return t

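# Hedged usage sketch (helper name and input are illustrative): greedy
# betacode decoding with pygtrie's longest_prefix(), which in pygtrie >= 2.2
# returns a step object with .key and .value that is falsy when no prefix of
# the input is stored in the trie.
def _demo_decode(text):
    t = _create_conversion_trie(strict=False)
    out, i = [], 0
    while i < len(text):
        step = t.longest_prefix(text[i:])
        if not step:
            out.append(text[i])  # pass unknown characters through
            i += 1
        else:
            out.append(step.value)
            i += len(step.key)
    return ''.join(out)
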
def initialize_trie(self):
    self.trie = pygtrie.CharTrie()
    with self.ix.reader() as reader:
        for doc in reader.iter_docs():
            # Key each document by the first token produced by the
            # normalized analyzer, mapping it to the original name.
            self.trie[list(
                WhooshConstants.normalized_analyzer(
                    doc[1]['name']))[0].text] = doc[1]['name']

def test_traverse_singleton_tree(self):
    t = pygtrie.CharTrie()
    t.update({'a': 10})

    r = t.traverse(self._make_test_node)
    self.assertNode(r, '', 1)
    self.assertNode(r.children[0], 'a', 0, 10)

def precomp(labe_probs, neighbs, T):
    """
    labe_probs - T x nlabes, normalized along dim 1
    neighbs - nesz length list of tags
    """
    trie = pygtrie.CharTrie()
    labe2idx = ASCII2IDX

    # Put every subsequence of length <= T into the trie.
    for ne in neighbs:
        for i in range(len(ne)):
            for j in range(i + 1, min(len(ne), i + T) + 1):
                trie[ne[i:j]] = True

    # Now we can get the cost for every start position.
    # Note we'll have trie[pfx][t] = cost of that prefix STARTING AT t.
    prev, start = None, None
    for key in trie.iterkeys():
        lastlabe = key[-1]
        if len(key) == 1:
            cost = (1 - labe_probs[:, labe2idx[lastlabe]])  # the "cost"
            start = 0
        elif prev is None:
            # Non 1-length restart; figure out where we are.
            prev = trie[key[:-1]]
            start = len(key) - 1
            cost = prev[:-1] + (1 - labe_probs[start:, labe2idx[lastlabe]])
        else:
            cost = prev[:-1] + (1 - labe_probs[start:, labe2idx[lastlabe]])
        trie[key] = cost
        if not trie.has_subtrie(key):
            # The terminal subsequence.
            prev, start = None, None
        else:
            prev = cost
            start += 1
    return trie

def set_value(partition_key, sort_key):
    lookup_key = partition_key + ":" + sort_key
    machine_index = hashes["hashes"].get_machine(partition_key)
    response = requests.post("http://{}/set/{}/{}".format(
        servers[machine_index], partition_key, sort_key),
        data=request.data)
    if lookup_key in indexed:
        return make_response(str(response.status_code), response.status_code)
    indexed[lookup_key] = True
    sort_index[lookup_key] = sort_key
    if sort_key not in sort_index:
        sort_index[sort_key] = pygtrie.CharTrie()
    sort_index[sort_key][partition_key] = lookup_key
    if partition_key not in between_index:
        between_index[partition_key] = Tree("", None, None)
    if partition_key not in partition_trees:
        partition_tree = both_between_index.insert(partition_key, Tree("", None, None))
        partition_trees[partition_key] = partition_tree
    between_index[partition_key].insert(sort_key, partition_key, lookup_key)
    partition_trees[partition_key].partition_tree.insert(
        sort_key, partition_key, lookup_key)
    return make_response(str(response.status_code), response.status_code)

def _optimize_scrabble_words(self):
    '''Initializes a Trie of all possible Scrabble words for optimized lookups.'''
    print('Optimizing Word Dictionary...')
    trie = pygtrie.CharTrie()
    for word in WORDS:
        trie[word] = True
    print('Done Optimizing.')
    return trie

def load_dict(name):
    t = pygtrie.CharTrie()
    with open(path.join(here, 'dict', 'zyenpheng.dict.' + name + '.yaml')) as f:
        for line in f:
            k, v = line.rstrip().split('\t')
            t[k] = v
    return t

def __init__(self, source=None):
    """Create an AutocompleteProvider.

    If source is provided, it is a file-like object used to train.
    """
    self.trie = pygtrie.CharTrie()
    if source:
        self.train(source.read())

def test_char():
    print('CharTrie Test')
    print('-------------')
    import pygtrie
    trie = pygtrie.CharTrie()
    trie.enable_sorting()
    for word in data.split():
        trie[word.lower()] = True
    print('K : ', ', '.join(trie.keys()))

def _update(self):
    data = loadjson(MEMBER)
    self._rosters = {}
    for group in data:
        self._rosters[group] = pygtrie.CharTrie()
        for user in data[group]:
            for name in data[group][user]:
                if name.lower() not in self._rosters[group]:
                    self._rosters[group][name.lower()] = user

def main():
    os.makedirs(args.wiki_preprocess, exist_ok=True)
    dummy = DummyVocab()
    datasets = read_datasets(args.dataroot, dummy, dummy, ctx_window=999)
    summarize_datasets(datasets)

    prefix_vocab = pygtrie.CharTrie()
    suffix_vocab = pygtrie.CharTrie()
    for _, dataset in tqdm(
            datasets.items(), dynamic_ncols=True, desc="Build Tries"):
        for mentions in dataset:
            for mention in mentions:
                mention_text = RE_PUNCT.sub(' ', mention.text.lower()).strip()
                prefix_vocab[mention_text + " "] = True
                suffix_vocab[mention_text[::-1] + " "] = True
    print(f'Forward trie size: {len(prefix_vocab)}')
    print(f'Backward trie size: {len(suffix_vocab)}')

    paths = list(glob.glob(os.path.join(args.wiki_dump, '*.xml-*')))
    paths = sorted(
        paths, key=lambda p: int(os.path.basename(p).split('-')[4][11:-4]))
    params = [(path, prefix_vocab, suffix_vocab) for path in paths]

    prefix_count = dict()
    suffix_count = dict()
    total_pages = 0
    with mp.Pool(processes=args.cpu) as pool, \
            tqdm(total=len(paths), dynamic_ncols=True) as pbar:
        for i, res in enumerate(pool.imap_unordered(process_stream, params)):
            prefix_count_, suffix_count_, page_counter = res
            total_pages += page_counter
            update_counts(prefix_count, prefix_count_)
            update_counts(suffix_count, suffix_count_)
            pbar.write(
                f'pages: {total_pages}, '
                f'# forward: {len(prefix_count)}/{len(prefix_vocab)}, '
                f'# backward: {len(suffix_count)}/{len(suffix_vocab)}')
            pbar.update()
            if i % 10 == 0:
                dump(prefix_count, suffix_count)
            del prefix_count_, suffix_count_
    dump(prefix_count, suffix_count)

def generate_trie(df, f):
    trie = pygtrie.CharTrie(df.to_dict('index'))
    if not os.path.exists(dict_path):
        os.makedirs(dict_path)
    with open(f, 'wb') as file:
        pickle.dump(trie, file)
    return trie

def __init__(self, file_path):
    self.trie = pygtrie.CharTrie()
    print('Building trie...')
    num_words = 0
    with open(file_path) as f:
        for word in f:
            self.trie[CleanWord(word)] = True
            num_words += 1
    print('Trie built. Words found:', num_words)

def store(self, partition_key, sort_key, item):
    lookup_key = partition_key + ":" + sort_key
    self.db[lookup_key] = item
    self.sort_index[lookup_key] = sort_key
    if sort_key not in self.sort_index:
        self.sort_index[sort_key] = pygtrie.CharTrie()
    self.sort_index[sort_key][partition_key] = lookup_key
    if partition_key not in self.between_index:
        self.between_index[partition_key] = Tree("", None, None)
    self.between_index[partition_key].insert(sort_key, partition_key, lookup_key)

def test_empty_tokens(self):
    tokens = []
    pre_process = lambda x: x
    pre_filter = lambda x: x
    post_filter = lambda x: isinstance(x, basestring)
    ngrams = 1
    joiner = ' '
    n_trie = trie.CharTrie()
    c = Core()
    r = c._extract_using_dictionary(tokens, pre_process, n_trie,
                                    pre_filter, post_filter, ngrams, joiner)
    self.assertEqual(r, None)

def solve_word_search(puzzle, words):
    copy = [[['0', 'N'] for i in range(len(puzzle[0]))]
            for j in range(len(puzzle))]
    trie = pygtrie.CharTrie()
    for w in words:
        trie[w] = True
    for y in range(len(puzzle)):
        for x in range(len(puzzle[0])):
            if trie.has_subtrie(puzzle[y][x]):
                out = search_word(trie, copy, puzzle, x, y, "", "n")
    return copy

def __init__(self, suffix_file=None, invert=False):
    """
    Constructor.

    Args:
        suffix_file (str): the path to the file containing the suffixes to filter.
        invert (bool): whether to invert matches.
    """
    self.suffix_set = pygtrie.CharTrie()
    self.create_filter_set(suffix_file)
    self.invert = invert

def compute_prefix_embeddings(words, emb_info, dtype='float'):
    """
    - Words in the pre-trained embeddings: discarded in the output
    - Words not in the pre-trained embeddings: the embeddings are computed
      as the mean of the embeddings of words which share prefixes with the
      input words.
    - Words with no matching prefix: discarded in the output
    """
    emb_words, emb = emb_info
    emb_w2i = build_w2i(emb_words)

    emb_words_trie = pygtrie.CharTrie()
    for w in emb_words:
        emb_words_trie[w] = 1

    output_pairs = []
    for w in words:
        if w not in emb_w2i:
            ## Handle emb words for which the input word is a prefix.
            longer_words = []
            if emb_words_trie.has_key(w):
                longer_words = emb_words_trie.keys(w)

            ## Handle emb words which are prefixes of the input word.
            shorter_words = [x[0] for x in emb_words_trie.prefixes(w)]

            ## Keep the longest short word and the shortest long word.
            matched_words = []
            if len(longer_words) > 0:
                matched_words.append(min(longer_words, key=lambda x: len(x)))
            if len(shorter_words) > 0:
                matched_words.append(max(shorter_words, key=lambda x: len(x)))

            ## The embedding of the word is the mean of the matched words' embeddings.
            if len(matched_words) > 0:
                w_emb = np.mean(np.array(
                    [emb[emb_w2i[mw]] for mw in matched_words], dtype=dtype),
                    axis=0)
                output_pairs.append((w, w_emb))

    output_words = [x[0] for x in output_pairs]
    output_emb = np.array([x[1] for x in output_pairs], dtype=dtype)
    return (output_words, output_emb)

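# Hedged toy example (assumes build_w2i from the surrounding module maps each
# word to its row index in the embedding matrix): 'player' is out of
# vocabulary, and its only prefix match is 'play', so it receives a copy of
# that word's embedding.
import numpy as np

_words, _vecs = compute_prefix_embeddings(
    ['player'], (['play', 'jump'], np.eye(2)))
# _words == ['player']; _vecs[0] equals the 'play' row.
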
class LocationTrie:
    __trie = pygtrie.CharTrie()

    def __init__(self):
        self.__trie.enable_sorting()

    def add_location(self, location):
        self.__trie[location] = True

    def delete_location(self, location):
        self.__trie.pop(location)

    def get_location_list(self, prefix):
        return self.__trie.keys(prefix)

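# Hedged usage sketch: because enable_sorting() is turned on, prefix queries
# come back alphabetically. Note that the underlying trie is class-level
# (shared across instances), and keys(prefix) raises KeyError if no location
# matches the prefix.
lt = LocationTrie()
lt.add_location('new york')
lt.add_location('new orleans')
assert lt.get_location_list('new ') == ['new orleans', 'new york']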