def replaceWords(self, dictionary, sentence):
    '''
    :param dictionary:
    :param sentence:
    :return:
    '''
    lis = sentence.split(" ")
    trie = Trie()
    for word in dictionary:
        trie.insert(word)
    # O(len(sentence)) time | O(26 * len(dictionary)) + O(len(sentence)) space.
    for idx, word in enumerate(sentence.split(" ")):
        root = trie.root
        result = []
        for w in word:
            if root and root.children[trie.getIndex(w)]:
                if root.children[trie.getIndex(w)].isEndNode:
                    result.append(w)
                    break
                root = root.children[trie.getIndex(w)]
            else:
                root = None
            result.append(w)
        lis[idx] = "".join(result)
    return " ".join(lis)
def longestWord(self, words):
    '''
    :param words:
    :return:

    Populate the trie: O(len(words) * max(len(word))).
    Then use DFS to find the deepest branch in the trie in which every node is a word end.
    As max(len(word)) is bounded, this is O(len(words)) time | O(len(words)) space.
    '''
    trie = Trie()
    for word in words:
        trie.insert(word)
    root = trie.root
    max_prefix = ""
    stack = [[root, ""]]
    while stack:
        curr_node, prefix = stack.pop()
        if len(max_prefix) < len(prefix):
            max_prefix = prefix
        for i in range(25, -1, -1):
            if curr_node.children[i] and curr_node.children[i].isEndNode:
                stack.append([curr_node.children[i], prefix + chr(97 + i)])
    return max_prefix
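# A quick usage sketch for longestWord. The wrapper class name "Solution" is
# hypothetical; the snippet above only shows the method, and the array-backed
# Trie it relies on is defined elsewhere in its repo.
# sol = Solution()
# print(sol.longestWord(["w", "wo", "wor", "worl", "world", "banana"]))
# expected: "world" -- the longest word whose every prefix is also in the list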
class TestTrie(unittest.TestCase):
    def setUp(self):
        self.trie = Trie()
        self.case = [
            "A", "a", "aa", "aal", "aalii", "aam", "Aani", "aardvark",
            "aardwolf", "Aaron", "Aaronic", "Aaronite", "Aaronitic", "Aaru",
            "Ab", "Ababdeh", "Ababua", "abac", "abacay", "abacinate"
        ]

    def testInsertSearch(self):
        words = set(self.case)
        length = len(words) // 2
        setA = set()
        for i in range(length // 2):
            setA.add(words.pop())
        setB = words
        for word in setA:
            self.trie.insert(word)
        for word in setA:
            self.assertTrue(self.trie.search(word))
        for word in setB:
            self.assertFalse(self.trie.search(word))

    def testStartWith(self):
        prefixes = set(["A", "a", "aa", "aal", "Aaron", "Ab", "aba", "abac"])
        others = [
            "abaciscus", "abacist", "aback", "abactinal", "Abe", "abaction"
        ]
        for word in set(self.case) - prefixes:
            self.trie.insert(word)
        for prefix in prefixes:
            self.assertTrue(self.trie.startsWith(prefix))
        for word in others:
            self.assertFalse(self.trie.startsWith(word))
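# The Trie class exercised by the test above is imported from elsewhere in its
# repo. A minimal dict-based sketch that would satisfy insert/search/startsWith
# as tested (an assumption for illustration, not the project's implementation):
class Trie:
    def __init__(self):
        self.root = {}

    def insert(self, word):
        node = self.root
        for ch in word:
            node = node.setdefault(ch, {})
        node['$'] = True  # end-of-word marker

    def _walk(self, key):
        node = self.root
        for ch in key:
            if ch not in node:
                return None
            node = node[ch]
        return node

    def search(self, word):
        node = self._walk(word)
        return node is not None and '$' in node

    def startsWith(self, prefix):
        return self._walk(prefix) is not None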
def suggestedProducts(self, products, searchWord):
    '''
    :param products:
    :param searchWord:
    :return:
    '''
    trie = Trie()
    for word in products:
        trie.insert(word)
    result = []
    p_crawl = trie.root
    prefix = ""
    # O(len(searchWord)^2 * len(products) * 26 * len(max(products))) time
    # O(len(max(products)) * 26 * len(products)) space
    for s in searchWord:
        prefix += s
        curr_result = []
        if p_crawl:
            p_crawl = p_crawl.children[trie.getIndex(s)]
        if p_crawl:
            curr_result = self.dfs(p_crawl, prefix)
        result.append(curr_result)
    return result
def test_insert_trie_multi(self):
    trie = Trie()
    trie.insert([1, 1, 1])
    trie.insert([1, 1, 2])
    trie.insert([1, 1, 3])
    trie.insert([1, 2, 1])
    trie.insert([1, 2, 2])
    trie.insert([1, 2, 3])
    trie.insert([1, 3, 4])
    trie.insert([5, 1, 1])
    trie.insert([5, 3, 1])
    collection = trie.collect([])
    print(collection)
    self.assertEqual(
        [[1, 1, 1], [1, 1, 2], [1, 1, 3], [1, 2, 1], [1, 2, 2],
         [1, 2, 3], [1, 3, 4], [5, 1, 1], [5, 3, 1]], collection)
    collection11 = trie.collect([1, 1])
    print(collection11)
    collection5 = trie.collect([5])
    self.assertEqual([[1, 1, 1], [1, 1, 2], [1, 1, 3]], collection11)
    self.assertEqual([[5, 1, 1], [5, 3, 1]], collection5)
    print(collection5)
def autocomplete(prefix: str, possible_queries: List[str]) -> List[str]:
    # Add all query strings to the Trie
    trie = Trie()
    for word in possible_queries:
        trie.insert(word)
    # Get the nested dictionary for the input prefix
    prefix_dict = trie.find(prefix)
    # Get all words from this dictionary
    return complete_words(prefix, prefix_dict)
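# complete_words is not shown in this snippet. A minimal sketch, assuming
# trie.find returns a nested dict keyed by characters with an ENDS_HERE
# sentinel marking word ends (the same convention the PrefixMapSum snippet
# further down uses):
ENDS_HERE = '#'  # assumed end-of-word sentinel


def complete_words(prefix, prefix_dict):
    # Walk the nested dict and rebuild every word that extends the prefix
    words = []
    for key, next_level in prefix_dict.items():
        if key == ENDS_HERE:
            words.append(prefix)
        else:
            words.extend(complete_words(prefix + key, next_level))
    return words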
def test_insert_trie_one(self):
    trie = Trie()
    arr1 = [1, 2, 3]
    trie.insert(arr1)
    collection = trie.collect([])
    print(collection)
    self.assertEqual([[1, 2, 3]], collection)
def load_trie():
    trie = Trie()
    count = 0
    with open("成语俗语.txt", encoding='utf-8') as f:
        for line in f:
            count += 1
            line = line.strip()
            trie.insert(line)
    print("word num:", count)
    return trie
def corpus2table(data_path, table_path=None, lang=None):
    trie = Trie()
    with open(data_path, 'r', encoding='utf-8') as inp:
        for line in inp:
            words = word_tokenize(line)
            for w in words:
                w = non_word_pattern.sub('', w)
                if not w:
                    continue
                trie.insert(f'{w.lower()}#')
    prefix_suffix_tree = trie.get_prefix_suffix_tree()
    print('Tree constructed')
    prefixes = sorted(prefix_suffix_tree.keys())
    suffix_counts = Counter()
    for v in prefix_suffix_tree.values():
        for k, count in v.items():
            suffix_counts[k] += count
    # Take N most common suffixes
    sorted_counts = suffix_counts.most_common(300)
    suffixes = [el[0] for el in sorted_counts]
    freqs = [el[1] for el in sorted_counts]
    d = pd.DataFrame(index=prefixes, columns=suffixes, dtype=int).fillna(0)
    for prefix, suffix_counts_for_prefix in prefix_suffix_tree.items():
        print(prefix)
        for suffix, count in suffix_counts_for_prefix.items():
            if suffix in d.columns:
                d.loc[prefix, suffix] = count
    print('Dataframe constructed')
    entropies = d.apply(entropy)
    if lang is not None:
        # Regress entropies on log frequencies
        plt.figure(figsize=(16, 10))
        plt.scatter(np.log(freqs), entropies, marker='o')
        plt.savefig(f'/home/macleginn/Analyses/bible-tables/img/entropies_log_freqs_{lang}.png')
    cutoff = np.quantile(entropies, 0.9)
    d = d.loc[:, entropies > cutoff]
    print('Columns selected')
    if table_path is not None:
        d.to_csv(table_path)
    return d
def replaceWords(self, dict, sentence):
    t = Trie()
    words = sentence.split()
    for key in dict:
        t.insert(key)
    new_sentence = []
    for word in words:
        d = t.get_first_word_in_item(word)
        if d:
            new_sentence.append(d)
        else:
            new_sentence.append(word)
    return " ".join(new_sentence)
def make_dot_file(ways, distances, file_name=''):
    unique_ways = [list(x) for x in set(tuple(x) for x in ways)]
    t = Trie(distances)
    for way in unique_ways:
        t.insert(way)
    trie = t.query(["Казань"])
    file_string = ''
    file_string += (f'digraph {file_name}' + '{\n')
    for edge in trie:
        file_string += (f'\t{edge}\n')
    file_string += ('}')
    file_string = file_string.replace('.', '')
    with open(f'data/{file_name}', 'w') as output:
        output.write(file_string)
def is_formation_possible(dictionary, word):
    trie = Trie()
    for w in dictionary:
        trie.insert(w)
    current_node = trie.root
    for i in range(len(word)):
        index = trie.get_index(word[i])
        if not current_node.children[index]:
            return False
        current_node = current_node.children[index]
        if current_node.is_end_word:
            if trie.search(word[i + 1:]):
                return True
    return False
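# A brief usage sketch for is_formation_possible, assuming the array-backed
# Trie (get_index / children / is_end_word / search) it expects is available.
# The keyword list is illustrative only.
keywords = ["the", "hello", "there", "answer", "any", "by", "world", "their"]
print(is_formation_possible(keywords, "helloworld"))  # expected True: "hello" + "world"
print(is_formation_possible(keywords, "helloworl"))   # expected False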
def word_to_subword(infile, outfile):
    preTrie = Trie()
    with open('../data/pre.txt', 'r') as f:
        for line in f:
            line = line.strip()
            preTrie.insert(line)
    suffTrie = Trie()
    with open('../data/suff.txt', 'r') as f:
        for line in f:
            line = line.strip()
            suffTrie.insert(line[::-1])
    print(preTrie.startsWith('including'))
    with open(outfile, 'w') as fw:
        with open(infile, 'r') as fr:
            for line in fr:
                for word in line.strip().split():
                    new_line = []
                    prefix = preTrie.startsWith(word)
                    suffix = suffTrie.startsWith(word[::-1])
                    if suffix:
                        suffix = suffix[::-1]
                    if prefix:
                        if suffix:
                            if len(prefix) + len(suffix) >= len(word):
                                new_line.extend([prefix, word[len(prefix):], ''])
                            else:
                                new_line.extend([prefix, word[len(prefix):-len(suffix)], suffix])
                        else:
                            new_line.extend([prefix, word[len(prefix):], ''])
                    else:
                        if suffix:
                            new_line.extend(['', word[:-len(suffix)], suffix])
                        else:
                            new_line.extend(['', '', ''])
                    new_line = [item if item != '' else '#' for item in new_line]
                    fw.write(word + '\t' + ' '.join(new_line) + '\n')
    print('word to subword finished successfully')
def generate_kb2id():
    id_dict = dict()
    id_dict['PAD'] = 0
    kb_word_id = 1
    trie = Trie()
    with open(args.kb_path, 'r', encoding='utf-8') as f:
        for word in f:
            word = word.strip()
            if word not in id_dict:
                trie.insert(word)
                id_dict[word] = kb_word_id
                kb_word_id = kb_word_id + 1
    with open(args.trie_path, 'wb') as f:
        pkl.dump(trie, f)
    with open(args.kb2id_path, 'wb') as f:
        pkl.dump(id_dict, f)
def corpora2dict(path):
    data_dict = {
        'log_frequency': [],
        'entropy': [],
        'doculect': []
    }
    trie = Trie()
    doculect = path.split('/')[-1].split('.')[0]
    print(doculect)
    with open(path, 'r', encoding='utf-8') as inp:
        for line in inp:
            words = word_tokenize(line)
            for w in words:
                w = non_word_pattern.sub('', w)
                if not w:
                    continue
                trie.insert(f'{w.lower()}#')
    prefix_suffix_tree = trie.get_prefix_suffix_tree()
    prefixes = sorted(prefix_suffix_tree.keys())
    suffix_counts = Counter()
    for v in prefix_suffix_tree.values():
        for k, count in v.items():
            suffix_counts[k] += count
    # Take N most common suffixes
    sorted_counts = suffix_counts.most_common(300)
    suffixes = [el[0] for el in sorted_counts]
    freqs = [el[1] for el in sorted_counts]
    # Construct an intermediate data frame to compute entropies
    d = pd.DataFrame(index=prefixes, columns=suffixes, dtype=int).fillna(0)
    for prefix, suffix_counts_for_prefix in prefix_suffix_tree.items():
        for suffix, count in suffix_counts_for_prefix.items():
            if suffix in d.columns:
                d.loc[prefix, suffix] = count
    entropies = d.apply(entropy)
    for f, e in zip(np.log(freqs), entropies):
        data_dict['doculect'].append(doculect)
        data_dict['log_frequency'].append(f)
        data_dict['entropy'].append(e)
    return data_dict
def test_insert_string(self):
    trie = Trie()
    trie.insert('ant')
    trie.insert('bar')
    trie.insert('bat')
    trie.insert('car')
    trie.insert('cat')
    trie.insert('cry')
    all_words = trie.collect_string('')
    print(f'all words: {all_words}')
    self.assertEqual(['ant', 'bar', 'bat', 'car', 'cat', 'cry'], all_words)
    a_words = trie.collect_string('a')
    print(f'a words: {a_words}')
    self.assertEqual(['ant'], a_words)
    b_words = trie.collect_string('b')
    print(f'b words: {b_words}')
    self.assertEqual(['bar', 'bat'], b_words)
    c_words = trie.collect_string('c')
    print(f'c words: {c_words}')
    self.assertEqual(['car', 'cat', 'cry'], c_words)
    ca_words = trie.collect_string('ca')
    print(f'ca words: {ca_words}')
    self.assertEqual(['car', 'cat'], ca_words)
def findLongestWord(self):
    word = self.word
    trie = Trie()
    queue = deque()
    # insert each key into the trie and enqueue all (key, remaining-suffix) pairs
    for key in word:
        # from longest to shortest
        prefixes = trie.getAllPrefix(key)
        for pf in prefixes:
            queue.append((key, key[len(pf):]))
        trie.insert(key)
    # get the longest word from the provided dictionary
    longest_word = ['', '']
    flag = 2  # keep the first two longest
    dic = {}  # mark visited words
    while queue:
        key, suffix = queue.popleft()
        if key not in dic and suffix in trie:
            dic[key] = True
            if len(key) > len(longest_word[0]):
                longest_word[1] = longest_word[0]
                longest_word[0] = key
            elif len(key) > len(longest_word[1]):
                longest_word[1] = key
        else:
            prefixes = trie.getAllPrefix(suffix)
            for pf in prefixes:
                queue.append((key, suffix[len(pf):]))
    # print result
    print("longest_word 1 is", longest_word[0], ", length is", len(longest_word[0]))
    print("longest_word 2 is", longest_word[1], ", length is", len(longest_word[1]))
    print("total words that can be combined from other words:", len(dic))
    return
class PrefixMapSum:
    def __init__(self):
        self._trie = Trie()
        self.values = defaultdict(int)

    def insert(self, key: str, value: int):
        self._trie.insert(key)
        self.values[key] = value

    def sum(self, prefix: str):
        # Get all possible words with the prefix
        words = self.complete_words(prefix, self._trie.find(prefix))
        # Sum values from the values dictionary for all possible words
        return sum(self.values[word] for word in words)

    def complete_words(self, prefix, prefix_dict: dict):
        words = []
        for key, next_level in prefix_dict.items():
            if key == ENDS_HERE:
                words.append(prefix)
            else:
                words.extend(self.complete_words(prefix + key, next_level))
        return words
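# A short usage sketch for PrefixMapSum with hypothetical keys and values,
# assuming the dict-of-dicts Trie with find() and ENDS_HERE used above.
pms = PrefixMapSum()
pms.insert("columbia", 3)
pms.insert("colossus", 4)
pms.insert("column", 2)
print(pms.sum("col"))   # expected 9: all three keys share the prefix
print(pms.sum("colu"))  # expected 5: "columbia" + "column"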
        elif len(tag) == 2:
            temp_tag = tag[0] + " " + tag[1]
        elif len(tag) == 3:
            temp_tag = tag[0] + " " + tag[1] + " " + tag[2]
        elif len(tag) == 4:
            temp_tag = tag[0] + " " + tag[1] + " " + tag[2] + " " + tag[3]
        elif len(tag) == 5:
            temp_tag = tag[0] + " " + tag[1] + " " + tag[2] + " " + tag[3] + " " + tag[4]
        elif len(tag) == 6:
            temp_tag = tag[0] + " " + tag[1] + " " + tag[2] + " " + tag[3] + " " + tag[4] + " " + tag[5]
        if len(word) == 1:
            Lexi.insert(word[0], temp_tag)
        elif len(word) == 2:
            Lexi.insert(word[0] + " " + word[1], temp_tag)
        elif len(word) == 3:
            Lexi.insert(word[0] + " " + word[1] + " " + word[2], temp_tag)


###### Main Menu ####################################################
#####################################################################
Exit = False
while not Exit:
    Key = raw_input(
        "=====:: Main Menu ::===============\n\n 1.Parse\n 2.Edit Grammar Or Lexicon\n 3.Represent Data Or Grammar\n 4.Exit\n\n :: "
    )
    if int(Key) == 1:
class Extractor(object):
    def __init__(self, rfpath, max_len=4):
        self.prefixTree = Trie()
        self.suffixTree = Trie(direction='suffix')
        self.vocabulary = []
        self.len_dict = dict()
        # To score words of n characters we need (n+1)-grams
        self.max_len = max_len + 1
        text = Cleaner.preprocess_text(rfpath)
        self.buildTreesAndDics(text)
        self.prefixTree.set_entropy()
        self.suffixTree.set_entropy()
        self.words = dict()

    def buildTreesAndDics(self, text):
        tic = time()
        pbar = tqdm(range(self.max_len))
        for i in pbar:
            pbar.set_description("buildTreesAndDics, %d-gram \n" % (i + 1))
            n_gram_list = sum(map(lambda x: Cleaner.n_gram(x, i + 1), text), [])
            self.len_dict[i + 1] = len(n_gram_list)
            if i >= 1:
                self.vocabulary.extend(list(set(n_gram_list)))
            for word in n_gram_list:
                self.prefixTree.insert(word, i + 1)
                self.suffixTree.insert(word, i + 1)
        print("build tree done! %.2fs" % (time() - tic))

    def score(self, candidate):
        '''
        For a candidate such as 淘宝:
        h_r_l: left entropy of 宝
        h_l_r: right entropy of 淘
        '''
        children = set()
        h_l, count = calculate_entropy(candidate, self.prefixTree, return_count=True)
        h_r = calculate_entropy(candidate, self.suffixTree, return_count=False)
        max_score = 0
        for seg_index in range(1, len(candidate)):
            pmi = cal_pmi(candidate, self.len_dict, seg_index, self.suffixTree)
            left_candidate = candidate[:seg_index]
            right_candidate = candidate[seg_index:]
            # Drop overlapping words. This is rather crude and over-prefers long
            # words, e.g. between 牛逼 and 牛逼牛逼牛逼 the latter would be chosen.
            if left_candidate in self.words:
                children.add(left_candidate)
            if right_candidate in self.words:
                children.add(right_candidate)
            h_r_l = calculate_entropy(right_candidate, self.prefixTree, return_count=False)
            h_l_r = calculate_entropy(left_candidate, self.suffixTree, return_count=False)
            score = min(h_l_r, h_r_l)
            if score > max_score:
                max_score = score
        if h_l == 0 or h_r == 0:
            return count, 0, 0
        for child in children:
            del self.words[child]
        max_score = pmi + min(h_l, h_r) - max_score
        return count, max_score, max_score * count

    def extract_words(self, thresh=None):
        # calculate PMI and frequency; remove dictionary words
        if thresh:
            for word in tqdm(self.vocabulary):
                count, score, final = self.score(word)
                if score > thresh:
                    self.words[word] = {
                        "candidate": word,
                        "count": count,
                        "score": score,
                        "final": final
                    }
            words = pd.DataFrame.from_dict(list(self.words.values()))
        else:
            words = pd.DataFrame(self.vocabulary, columns=['candidate'])
            words[['count', 'score', 'final']] = words.apply(
                lambda x: pd.Series(self.score(x['candidate'])), axis=1)
        if words.shape[0]:
            words = words.sort_values("final", ascending=False).reset_index(drop=True)
        return words
'''
You're given a dictionary of strings, and a key. Check if the key is composed of
an arbitrary number of concatenations of strings from the dictionary. For example:

dictionary: "world", "hello", "super", "hell"

key: "helloworld" --> return true
key: "superman" --> return false
key: "hellohello" --> return true
'''
from Trie import Trie

words = ["world", "hello", "super", "hell"]
trie = Trie()
for word in words:
    trie.insert(word, 1)


def search(root, key, new_start=False):
    if root is None:
        return False
    if new_start:
        if not root.children.get(key[0], None):
            return False
    if len(key) == 0:
        if root.data == 1:
            return True
        return False
    # Since we still have characters left, we search for the child node using the next character
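# The recursive trie search above is cut off here. For comparison, the same
# concatenation check can be written as a self-contained set-based dynamic
# program; this is a sketch of an alternative approach, not the trie-based
# routine this file builds up to.
def is_concatenation(key, dictionary):
    # dp[i] is True when key[:i] can be split into dictionary words
    words = set(dictionary)
    dp = [False] * (len(key) + 1)
    dp[0] = True
    for i in range(1, len(key) + 1):
        for j in range(i):
            if dp[j] and key[j:i] in words:
                dp[i] = True
                break
    return dp[len(key)]


print(is_concatenation("helloworld", ["world", "hello", "super", "hell"]))  # True
print(is_concatenation("superman", ["world", "hello", "super", "hell"]))    # False
print(is_concatenation("hellohello", ["world", "hello", "super", "hell"]))  # True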
'''Made by Jackson Bremen ||| Written Summer 2018, Refactored Winter 2020
Trie Datastructure used from github below, additional functionality added
https://www.wordplays.com/boggle has line crossing be legal
'''
import readline
from Trie import Trie

dictionary = Trie()
with open('allScrabbleWords.txt', 'r') as file:
    for i in file.read().split():
        dictionary.insert(i)

board = []
with open('boggleBoard.txt', 'r') as file:
    for line in file.readlines():
        board.append([])
        for char in line.split():
            board[-1].append(char.upper())


def print_board(board):
    for y in board:
        for x in y:
            if len(x) == 2:
                print(x, end=' ')
            else:
                print(x, end='  ')
        print()
class preprocess:
    def __init__(self):
        self.trie = Trie()
        self.spam_count = 0
        self.normal_count = 0
        self.dump_csv = []

    def is_ascii(self, s):
        return all(ord(c) < 128 for c in s)

    def remove_non_ascii(self, s):
        return re.sub(r'[^\x00-\x7f]', r' ', s)

    def get_trie(self):
        data = pandas.read_csv("spam_or_not_spam.csv")
        for index, row in data.dropna().iterrows():
            # if index > 2600:
            #     continue
            is_spam = int(row["label"])
            seen = set()
            email = self.remove_non_ascii(row["email"])
            for word in email.split():
                if word in seen:
                    continue
                seen.add(word)
                word = ''.join(filter(str.isalnum, word))
                if word.isdigit():
                    continue
                self.trie.insert(word, is_spam)
            if is_spam == 1:
                self.spam_count += 1
            else:
                self.normal_count += 1

    def dfs(self, word="", node=None):
        if not node:
            node = self.trie.root
        if node.is_word:
            self.dump_csv.append({
                "word": word,
                "spam": node.spam_count,
                "normal": node.normal_count
            })
        for c in node.children:
            self.dfs(word + c, node.children[c])

    def process(self):
        self.get_trie()
        self.dfs()
        new_df = pandas.DataFrame(self.dump_csv, columns=['word', 'spam', 'normal'])
        print("test: ", self.trie.search("interred").normal_count)
        with open("cleaned.csv", "w") as f:
            new_df.to_csv(f, header=True, mode='w', line_terminator="\n")
class Extractor(object):
    def __init__(self, rfpath=None, text=None, max_len=4):
        self.prefixTree = Trie()
        self.suffixTree = Trie(direction='suffix')
        self.vocabulary = []
        self.len_dict = dict()
        # To score words of n characters we need (n+1)-grams
        self.max_len = max_len + 1
        if rfpath is not None:
            text = Cleaner.preprocess_text(rfpath)
        elif text is None:
            raise ValueError()
        self.buildTreesAndDics(text)
        self.prefixTree.set_entropy()
        self.suffixTree.set_entropy()
        self.words = dict()

    def buildTreesAndDics(self, text):
        tic = time()
        for i in range(self.max_len):
            n_gram_list = sum(map(lambda x: Cleaner.n_gram(x, i + 1), text), [])
            self.len_dict[i + 1] = len(n_gram_list)
            if i >= 1:
                self.vocabulary.extend(list(set(n_gram_list)))
            for word in n_gram_list:
                self.prefixTree.insert(word, i + 1)
                self.suffixTree.insert(word, i + 1)
            sys.stdout.write('build tree done %d/%d\r' % (i, self.max_len))

    def score(self, candidate, cnt_thresh):
        '''
        For a candidate such as 淘宝:
        h_r_l: left entropy of 宝
        h_l_r: right entropy of 淘
        '''
        children = set()
        h_l, count = calculate_entropy(candidate, self.prefixTree, return_count=True)
        if count < cnt_thresh:
            return count, None, None
        h_r = calculate_entropy(candidate, self.suffixTree, return_count=False)
        min_score = np.inf
        for seg_index in range(1, len(candidate)):
            left_candidate = candidate[:seg_index]
            right_candidate = candidate[seg_index:]
            if left_candidate in self.words:
                children.add(left_candidate)
            if right_candidate in self.words:
                children.add(right_candidate)
            h_r_l = calculate_entropy(right_candidate, self.prefixTree, return_count=False)
            h_l_r = calculate_entropy(left_candidate, self.suffixTree, return_count=False)
            pmi = cal_pmi(candidate, self.len_dict, seg_index, self.suffixTree)
            score = pmi - min(h_l_r, h_r_l)
            if score < min_score:
                min_score = score
        if h_l == 0 or h_r == 0:
            return count, 0, 0
        min_score += min(h_l, h_r)
        for child in children:
            # Sub-segments occur at least as often; prefer the longer candidate
            if min_score > self.words[child]['score']:
                del self.words[child]
        return count, min_score, min_score * count

    def extract_words(self, score_thresh=4.0, cnt_thresh=20):
        # calculate PMI and frequency; remove dictionary words
        for i, word in enumerate(self.vocabulary):
            res = self.score(word, cnt_thresh)
            count, score, final = res
            if score is None or score < score_thresh:
                continue
            self.words[word] = {"candidate": word, "count": count,
                                "score": score, "final": final}
            sys.stdout.write('extract words done %d/%d\r' % (i, len(self.vocabulary)))
        words = pd.DataFrame.from_dict(list(self.words.values())).sort_values(
            "final", ascending=False).reset_index(drop=True)
        return words
from Trie import Trie

trie = Trie()
trie.insert("apple")
result1 = trie.search("apple")
result2 = trie.search("app")
result3 = trie.startsWith("app")
trie.insert("app")
result4 = trie.search("ap")
print(result1, result2, result3, result4)
class SpellSuggestion(object):
    def __init__(self, regex=r"[\w]+"):
        """
        initialize the WORDS dictionary, where each key is a word
        and the value is the number of occurrences of that key
        """
        self.trie = Trie()
        self.regex = regex
        with open(english_words, "r") as f:
            # Create a dictionary storing all the words and their occurrences
            self.WORDS = Counter(self.words_token(f.read()))
        for word in self.WORDS.keys():
            # put all the words into the Trie
            self.trie.insert(word)

    def words_token(self, text):
        """ extract all the words from text in lowercase """
        if text is not None and text != "":
            return re.findall(self.regex, text.lower(), re.MULTILINE)
        else:
            return ["UWindsor"]

    def probability_of_word(self, word):
        """ calculate the probability of a given word """
        # return self.WORDS[word] / len(self.WORDS.values())
        return self.WORDS[word] / sum(self.WORDS.values())

    def edit_distance_1(self, word):
        """ get all combinations of the given word at edit distance 1 """
        letters = string.ascii_lowercase
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        deletes = [L + R[1:] for L, R in splits if R]
        transposes = [L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
        replaces = [L + c + R[1:] for L, R in splits if R for c in letters]
        inserts = [L + c + R for L, R in splits for c in letters]
        return set(replaces + deletes + transposes + inserts)

    def edit_distance_2(self, word):
        """ get all combinations of the given word at edit distance 2 """
        return set(e2 for e1 in self.edit_distance_1(word) for e2 in self.edit_distance_1(e1))

    def shown(self, words):
        """ return words that appear in WORDS, based on an edit distance of either 1 or 2 """
        return set(w for w in words if w in self.WORDS)

    def candidates(self, word):
        """ get all the candidate words for the given word """
        return (self.shown([word]) or self.shown(self.edit_distance_1(word))
                or self.shown(self.edit_distance_2(word)) or [word])

    def correct_word(self, word) -> str:
        """ get the most probable word suggestion for the given word """
        return max(self.candidates(word), key=self.probability_of_word)

    def spell_checker(self, words) -> str:
        """ check all the words and return the words with modifications """
        tokens = self.words_token(words)
        return " ".join([self.correct_word(token) for token in tokens])

    # def spell_checker(self, word):
    #     """ get the most probable spelling suggestion for the given word. """
    #     # return sorted(self.candidates(word), key=self.probability_of_word)[0]
    #     return max(self.candidates(word), key=self.probability_of_word)

    def auto_completer(self, prefix, top=5):
        """ return the top N auto-complete suggestions for the given prefix """
        return self.trie.autocomplete(prefix, top)  # get the top words for the given prefix
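# A quick usage sketch for SpellSuggestion, assuming english_words points to a
# readable word-frequency corpus on disk; outputs depend on that corpus.
checker = SpellSuggestion()
print(checker.spell_checker("speling mistaks"))  # likely corrected to "spelling mistakes"
print(checker.auto_completer("spe", top=3))      # top-3 trie completions for the prefix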
def sort_list(arr):
    trie = Trie()
    for s in arr:
        trie.insert(s)
    return list(find(trie.root, ''))
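# The find generator used by sort_list is not included in this snippet. A
# minimal sketch, assuming each node exposes a dict of children keyed by
# character and an is_word flag (attribute names are assumptions):
def find(node, prefix):
    # Yield complete words in lexicographic order via DFS over the trie
    if node.is_word:
        yield prefix
    for ch in sorted(node.children):
        yield from find(node.children[ch], prefix + ch)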
class SimpleAPI():
    def __init__(self):
        self.titleTrie = Trie()
        self.categoryNameTrie = Trie()
        self.brandNameTrie = Trie()
        self.productDict = {}
        self.productIdDict = {}
        self.titleDict = {}
        self.brandIdDict = {}
        self.brandNameDict = {}
        self.categoryIdDict = {}
        self.categoryNameDict = {}
        self.keywordFrequencyDict = {}

    def _combinations(self, L, final, tmp=None):
        if tmp is None:
            tmp = []
        if L == []:
            final.append(tmp)
        else:
            for i in L[0]:
                self._combinations(L[1:], final, tmp + [i])
        return final

    def _getListOfTypeCombinations(self, conditions):
        listOfTypes = []
        for condition in conditions:
            listOfTypes.append(self._combinations(
                [[condition['type']], [value.lower() for value in condition['values']]], []))
        listOfTypeCombinations = self._combinations(listOfTypes, [])
        return listOfTypeCombinations

    def _addToDict(self, d, key, value):
        if key in d:
            d.get(key).append(value)
        else:
            d[key] = [value]

    def _addToFreqDict(self, keyList):
        for keys in keyList.split():
            keys = regex.sub(' ', keys).split()
            for key in keys:
                if key in self.keywordFrequencyDict:
                    self.keywordFrequencyDict[key] = self.keywordFrequencyDict[key] + 1
                else:
                    self.keywordFrequencyDict[key] = 1

    def initializeApi(self, fileName):
        with open(fileName, encoding='utf8') as in_file:
            for line in in_file:
                output = []
                columns = line.split("\t")
                for index, c in enumerate(columns):
                    c = c.lower()
                    output.append(c)
                    if index == 1:
                        self._addToFreqDict(c)
                        self.titleTrie.insert(c)
                    elif index == 3:
                        self.brandNameTrie.insert(c)
                    elif index == 5:
                        self.categoryNameTrie.insert(c)
                self.productIdDict[output[0]] = output[0]
                self._addToDict(self.titleDict, output[1], output[0])
                self._addToDict(self.brandIdDict, output[2], output[0])
                self._addToDict(self.brandNameDict, output[3], output[0])
                self._addToDict(self.categoryIdDict, output[4], output[0])
                self._addToDict(self.categoryNameDict, output[5][:-1], output[0])
                self.productDict[output[0]] = output

    def endpoint1(self, type, prefix):
        prefix = prefix.lower()
        if type == 'title':
            return list(self.titleTrie.allWordsStartingWithPrefix(prefix))
        elif type == 'category':
            return list(self.categoryNameTrie.allWordsStartingWithPrefix(prefix))
        elif type == 'brand':
            return list(self.brandNameTrie.allWordsStartingWithPrefix(prefix))

    def endpoint2(self, conditions, pagination):
        responseHeadings = ['productId', 'title', 'brandId', 'brandName', 'categoryId', 'categoryName']
        fromPagination = pagination['from']
        sizePagination = pagination['size']
        resultingProductIds = []
        listOfTypeCombinations = self._getListOfTypeCombinations(conditions)
        for typeCombination in listOfTypeCombinations:
            setList = []
            for type in typeCombination:
                setList.append(eval('self.' + type[0] + 'Dict.get(\'' + type[1] + '\')'))
            resultingProductIdsByTypeCombination = set.intersection(*list(map(set, setList)))
            resultingProductIds += resultingProductIdsByTypeCombination
        paginatedResultingProductIds = resultingProductIds[fromPagination:fromPagination + sizePagination]
        searchResults = []
        for productId in paginatedResultingProductIds:
            searchResults.append(dict(zip(responseHeadings, self.productDict[productId])))
        return searchResults

    def endpoint3(self, keywords):
        searchResult = {}
        searchResult['keywordFrequencies'] = []
        for keyword in keywords:
            keywordFrequency = self.keywordFrequencyDict.get(keyword.lower(), 0)
            searchResult['keywordFrequencies'] = searchResult['keywordFrequencies'] + [[keyword, str(keywordFrequency)]]
        return searchResult

    def closeEndpoint(self):
        self.titleTrie = None
        self.categoryNameTrie = None
        self.brandNameTrie = None
        self.productDict = {}
        self.productIdDict = {}
        self.titleDict = {}
        self.brandIdDict = {}
        self.brandNameDict = {}
        self.categoryIdDict = {}
        self.categoryNameDict = {}
        self.keywordFrequencyDict = {}
class pinyin(object):
    def __init__(self, pinyins):
        self.pinyins = pinyins
        # Load all valid pinyin syllables
        self.tree = Trie()
        f = open('pinyin/pinyin_list.txt')
        # f = open('pinyin_list.txt')
        for line in f:
            self.tree.insert(line.split()[0])
        f.close()

    def split(self):
        '''
        Split function
        @param pinyin: pinyin string (str)
        @return: list of pinyin syllables after splitting (list)
        '''
        # Letters that can start a pinyin syllable
        pinyin_initials = [
            'a', 'b', 'e', 'p', 'm', 'f', 'd', 't', 'n', 'l', 'g', 'k', 'h',
            'j', 'q', 'x', 'r', 'z', 'c', 's', 'y', 'w'
        ]
        # pinyin_initials = self.tree.root.children
        iuv = ['i', 'u', 'v']
        grn = ['g', 'r', 'n']
        input = ''
        result = []
        for i in range(len(self.pinyins)):
            c = self.pinyins[i]
            # read character c
            input += c
            # c is i|u|v and is the first letter of the pinyin string
            if c in iuv and len(input) == 1:
                return False, None
            # the current buffer is a valid pinyin or the prefix of a valid pinyin
            if self.tree.find_initial_with(input):
                continue
            # c is an initial consonant
            if c in pinyin_initials:
                # the preceding buffer is a valid pinyin
                if self.tree.find_initial_with(input[:-1]):
                    # split before c
                    result.append(input[:-1])
                    input = input[-1:]
                    continue
                else:
                    return False, None
            # the second-to-last letter is g|r|n
            elif input[-2:-1] in grn:
                # splitting before g|r|n gives a valid pinyin
                if self.tree.find_initial_with(input[:-2]):
                    # split before g|r|n
                    result.append(input[:-2])
                    input = input[-2:]
                    continue
                # splitting after g|r|n gives a valid pinyin
                elif self.tree.find_initial_with(input[:-1]):
                    # split after g|r|n
                    result.append(input[:-1])
                    input = input[-1:]
                    continue
                else:
                    # split the buffer off on its own
                    result.append(input)
                    input = ''
        result.append(input)
        return True, result
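# A brief usage sketch for the pinyin splitter, assuming pinyin/pinyin_list.txt
# is present with one valid syllable per line; outputs depend on that list.
ok, parts = pinyin('zhongguo').split()
print(ok, parts)  # expected: True ['zhong', 'guo']
ok, parts = pinyin('iuv').split()
print(ok, parts)  # expected: False None (cannot start with i/u/v)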
for line in open(os.path.join(dir, f"ent_ids_{id}"), "r", encoding="utf-8"):
    line = line.strip().split("\t")
    eid, url = line[0], line[1]
    ent = line[1].split("/")[-1]
    ent_name = ent.split("(")[0].replace("_", " ").strip()
    ent = "dbpedia/" + ent
    fout.write(f"{eid}\t{url}\t{ent}\t{ent_name}\n")
fout.close()

# dump_path
trie = Trie()
all_ents = set([])
for line in open(dump_path, "r", encoding="utf-8"):
    # lower
    line = line.rstrip('\n').split('\t')
    trie.insert(line[3], line[2])
    all_ents.add(line[2])
print("Built trie.")

out_file = f"../data/wiki_db/{lang1}_{lang2}/{id2lang[id]}.txt"
corpora_file = f"../data/wiki/{id2lang[id]}.txt"
num = 0
found = 0
found_entity = set([])
with open(out_file, "w") as f:
    t0 = time.time()
    for line in open(corpora_file, "r", encoding="utf-8"):
        hit4line = 0
        num += 1
        # lower
        # line = line.lower()