def main():
    """Index a text file by word, then answer interactive prefix queries.

    The user is prompted for a file name until an existing regular file is
    given.  Every whitespace-separated token is lower-cased and stripped of
    all characters other than ASCII letters and '-'.  Each resulting word is
    added to a Trie and its position is recorded as "<line>-<word>", both
    numbers counted from 1.  Tokens that end up empty (for example "123")
    are not indexed, but they still count towards the word position.

    Afterwards the user is repeatedly asked for a prefix, and every indexed
    word returned by ``Trie.get_prefix`` is printed with its positions.
    Ctrl + C (or end of input) at that prompt ends the function cleanly.
    """
    # Trie allows us to retrieve all words beginning with the designated prefix
    wordTrie = Trie()
    instances = {}

    # get file name from user; isfile() also rejects directories, which
    # exists() would accept only for open() to fail afterwards
    file_prompt = "Please enter file to be indexed (be sure to include .txt extension): "
    filename = input(file_prompt)
    while not path.isfile(filename):
        print("File does not exist.")
        filename = input(file_prompt)

    # read words from file, recording their position along the way
    with open(filename, "r") as source:
        for line_num, line in enumerate(source, start=1):
            for word_num, token in enumerate(line.split(), start=1):
                # use regex to convert all words to lowercase and remove special chars
                word = re.sub(r'[^a-zA-Z-]', "", token.lower())
                if not word:
                    # nothing left to index (token was digits/punctuation only)
                    continue
                # store word in trie
                wordTrie.addWord(word)
                # store the instances of word
                position = str(line_num) + "-" + str(word_num)
                instances.setdefault(word, []).append(position)

    # allow user to search for desired prefix, printing all words beginning with entered prefix
    while True:
        try:
            pre = input(
                "Please enter prefix of word, or entire word, you'd like to search for (Ctrl + C to quit): "
            )
        except (KeyboardInterrupt, EOFError):
            # the prompt advertises Ctrl + C as the way out: leave quietly
            print()
            break
        words = wordTrie.get_prefix(pre)
        if not words:
            print("There are no words beginning with", pre, "- Please try again.")
        else:
            for item in words:
                print(item, str(instances[item]))
class testTrie(unittest.TestCase):
    """Unit tests for Trie: membership, removal and child bookkeeping.

    Every test starts from a trie that already contains the word "cat".
    """

    def setUp(self):
        self.trie = Trie()
        self.trie.addWord("cat")

    def test_check_is_word(self):
        # A stored word is reported, a mere prefix of it is not.
        self.assertTrue(self.trie.isWord("cat"))
        self.assertFalse(self.trie.isWord("ca"))

    def test_remove_word(self):
        self.trie.removeWord("cat")
        self.assertFalse(self.trie.isWord("cat"))

    def test_words_with_shared_letters(self):
        # Adding a word with a common prefix must not disturb the first one
        # nor turn the shared prefix into a word.
        self.trie.addWord("cab")
        self.assertTrue(self.trie.isWord("cat"))
        self.assertFalse(self.trie.isWord("ca"))
        self.assertTrue(self.trie.isWord("cab"))

    def test_node_children(self):
        self.trie.addWord("cab")
        self.trie.addWord("dog")
        # keys() is a view on Python 3 (never equal to a list) and unordered
        # on Python 2, so compare the sorted keys instead.
        self.assertEqual(sorted(self.trie._sentinel.children.keys()), ["c", "d"])
def makeTrie(dict_file):
    """Build a Trie from a word-list file.

    Args:
        dict_file: path of a text file read line by line, one word per line.

    Returns:
        A Trie holding every non-blank line, stripped of surrounding
        whitespace and upper-cased.
    """
    tree = Trie()
    with open(dict_file, 'r') as f:
        for line in f:
            word = line.strip().upper()
            if word:
                # blank lines would otherwise register "" as a word
                tree.addWord(word)
    return tree
def makeTrie(self, words):
    """Creates a trie containing every entry of words, added in iteration order."""
    built = Trie()
    insert = built.addWord
    for entry in words:
        insert(entry)
    return built
class Dictionary:
    """Word dictionary with unigram/bigram counts and HMM tables.

    The instance holds a Trie of the unigram words, unigram and bigram
    frequency tables, Good-Turing discounted counts for bigram frequencies
    below ``k``, and the 'init' / 'trans' / 'emit' tables of an HMM loaded
    from JSON.  The code runs on both Python 2 and Python 3.
    """

    def __init__(self,json_data = None,u_file = "data/udict.txt",b_file = "data/bdict.txt",HMMfile="data/HMM.json"):
        """Load the HMM tables, then the counts.

        Args:
            json_data: path of a JSON dump written by writeJsonData; when
                falsy the tab-separated dictionaries are read instead.
            u_file: unigram dictionary, lines of "word<TAB>frequency".
            b_file: bigram dictionary, lines of "w1 w2<TAB>frequency".
            HMMfile: JSON file with the keys 'init', 'trans' and 'emit'.
        """
        # Trie of all unigram words
        self.dictTree = Trie()
        # Unigram table and total unigram frequency
        self.u_word = {}
        self.wordN = 0
        # Bigram table and total bigram frequency
        self.b_word = {}
        self.bwordN = 0
        # Good-Turing parameters: counts below k are replaced by c[count]
        self.k = 5
        self.c = [0] * self.k
        # HMM tables
        self.HMM_init = {}
        self.HMM_trans = {}
        self.HMM_emit = {}

        self.__loadHMMJson(HMMfile)
        if not json_data:
            self.__loaddic(u_file,b_file)
        else:
            self.__loadjsondata(json_data)

    def findWord(self,word):
        """Return the first element of ``self.dictTree.getWrodN(word)``.

        getDAG iterates over the result and uses each item as a length
        offset -- presumably the lengths of dictionary words that are
        prefixes of ``word``; confirm against the Trie implementation.
        """
        return self.dictTree.getWrodN(word)[0]

    def getDAG(self,sentence):
        """Return the segmentation graph of ``sentence`` as a 0/1 matrix.

        The result is a (len+1) x (len+1) list of lists in which
        ``DAG[i][j] == 1`` marks ``sentence[i:j]`` as a candidate token.
        Every single character is a candidate, plus every length reported
        by findWord for the suffix starting at ``i``.
        """
        s_len = len(sentence)
        DAG = [[0] * (s_len + 1) for _ in range(s_len + 1)]
        for i in range(s_len + 1):
            s_t = self.findWord(sentence[i:])
            if i != s_len:
                DAG[i][i+1] = 1
            for t in s_t:
                DAG[i][i+t] = 1
        return DAG

    def get2GramProb(self,word1,word2):
        """Return the smoothed probability of the bigram "word1 word2".

        Counts below ``k`` (including unseen bigrams, count 0) are replaced
        by their Good-Turing discounted value ``c[count]``.
        """
        key = word1 + ' ' + word2
        n = self.b_word.get(key,0)
        if n < self.k:
            n = self.c[n]
        return float(n) / self.bwordN

    def get2GramProbLog(self,word1,word2):
        """Return the natural log of the smoothed bigram probability.

        Raises:
            ValueError: if the (discounted) count is not positive.
        """
        key = word1 + ' ' + word2
        n = self.b_word.get(key,0)
        if n < self.k:
            n = self.c[n]
        return math.log(float(n)) - math.log(self.bwordN)

    def writeJsonData(self,filepath):
        """Dump the trie data, count tables and Good-Turing parameters as JSON.

        The file is overwritten: appending a second document to an existing
        dump would leave a file that json.loads can no longer parse.
        """
        data = {}
        data['root'] = self.dictTree.getData()
        data['u_word'] = self.u_word
        data['wordN'] = self.wordN
        data['b_word'] = self.b_word
        data['bwordN'] = self.bwordN
        data['K'] = self.k
        data['C'] = self.c
        print("writting json file...")
        with open(filepath,'w') as fp:
            fp.write(json.dumps(data))

    def __loadjsondata(self,json_file):
        """Restore the state written by writeJsonData from ``json_file``."""
        print("loading Json file...")
        with open(json_file) as jsonfp:
            data = json.loads(jsonfp.read())
        self.dictTree.setData(data['root'])
        self.u_word = data['u_word']
        self.wordN = data['wordN']
        self.b_word = data['b_word']
        self.bwordN = data['bwordN']
        # The Good-Turing values are stored in the dump, so nothing is
        # recomputed here.
        self.c = data['C']
        self.k = data['K']

    def __loaddic(self,u_file,b_file):
        """Read the tab-separated dictionaries and compute Good-Turing counts.

        Raises:
            KeyError: if no bigram has frequency ``r`` for some ``r`` in
                1..k+1 (the Good-Turing formula needs all of them).
        """
        # Read the unigram dictionary
        print("load 1gram dict...")
        with open(u_file) as ufp:
            for line in ufp:
                line = line.strip()
                if line == "":
                    continue
                l = line.split('\t')
                # Guard against malformed dictionary lines
                if len(l)<2:
                    continue
                word = l[0]
                if isinstance(word, bytes):
                    # Python 2 reads byte strings; keys are kept as text
                    word = word.decode('utf-8')
                freq = int(l[1])
                self.u_word[word] = freq
                self.wordN += freq
                self.dictTree.addWord(word)

        # Read the bigram dictionary
        print("load 2gram dict...")
        with open(b_file) as bfp:
            for line in bfp:
                line = line.strip()
                if line == "":
                    continue
                l = line.split('\t')
                # Guard against malformed dictionary lines
                if len(l)<2:
                    continue
                if len(l[0].split(' '))<2:
                    continue
                word = l[0]
                if isinstance(word, bytes):
                    word = word.decode('utf-8')
                freq = int(l[1])
                self.b_word[word] = freq
                self.bwordN += freq
        print("%d words." % self.wordN)

        self.__computeGoodTuring()

    def __computeGoodTuring(self):
        """Fill ``self.c`` with the discounted counts for 0 <= r < k."""
        # NC[r] = number of bigrams seen exactly r times
        NC = {}
        for freq in self.b_word.values():
            NC[freq] = NC.get(freq, 0) + 1
        NC[0] = self.wordN * self.wordN - self.bwordN

        # Float arithmetic throughout: integer division (Python 2) would
        # truncate the correction term and distort the discount.
        ratio = (self.k + 1) * NC[self.k + 1] / float(NC[1])
        for i in range(self.k):
            turing = (i + 1) * float(NC[i + 1]) / float(NC[i])
            self.c[i] = (turing - i * ratio) / (1 - ratio)

    def __loadHMMJson(self,file):
        """Load the HMM 'init', 'trans' and 'emit' tables from a JSON file."""
        with open(file) as fp:
            data = json.loads(fp.read())
        self.HMM_init = data['init']
        self.HMM_trans = data['trans']
        self.HMM_emit = data['emit']
class Wordplay:
    """Anagram and palindrome solver backed by two tries.

    Every word of the word list is stored upper-cased in a forward trie and,
    reversed, in a reverse trie.  Anagram solutions are built by walking the
    forward trie; palindrome solutions are built from both ends at once by
    walking the forward and the reverse trie in lockstep.  When
    ``multipleWords`` is true a solution may consist of several words
    separated by spaces.
    """

    # maxSolutions value meaning "no limit"
    DEFAULT_MAX = 0
    # Default ordering of candidate keys.  Candidate sets can mix letters
    # with None / the END_* integers, which cannot be compared directly on
    # Python 3, so the keys are ranked explicitly: None first, then the
    # integer markers, then the letters (the order Python 2 used for them).
    DEFAULT_KEY = lambda x: (
        (0, 0, "") if x is None
        else (1, x, "") if isinstance(x, int)
        else (2, 0, x)
    )
    # Pseudo-keys of the palindrome search: a word ends at the front, at the
    # rear, or at both ends of the partial solution.
    END_FRONT = 1
    END_REAR = 2
    END_BOTH = 3

    def __init__(self, wordlist='wordlists/simple.txt', multipleWords=False, minWordSize=1):
        """Load the word list into the forward and reverse tries.

        Args:
            wordlist: path of a text file with one word per line.
            multipleWords: allow solutions made of several words.
            minWordSize: words shorter than this are ignored.
        """
        self._reverseTrie = Trie()
        self._forwardTrie = Trie()
        self._minWordSize = minWordSize
        self._multipleWords = multipleWords
        with open(wordlist) as fp:
            for word in fp:
                self._addWord(word)

    def has(self, word):
        """Return whether word is in the forward trie and its reverse in the reverse trie."""
        return self._forwardTrie.has(word) and \
            self._reverseTrie.has(word[::-1])

    def pickRandomAnagram(self, cipher):
        """Picks a random anagram of cipher and returns it or None."""
        return next(self.solveRandomAnagram(cipher, 1), None)

    def pickRandomPalindrome(self, cipher):
        """Picks a random palindrome made of cipher's letters, or None."""
        return next(self.solveRandomPalindrome(cipher, 1), None)

    def pickFirst(self, cipher):
        """Picks the first anagram it can find."""
        # The class defines no solve(); solveAnagram is what the docstring
        # describes.
        return next(self.solveAnagram(cipher, 1), None)

    def solveRandomPalindrome(self, cipher, maxSolutions=DEFAULT_MAX):
        """Like solvePalindrome, but explores candidates in random order."""
        return self.solvePalindrome(cipher, maxSolutions, lambda x: random.random())

    def solveRandomAnagram(self, cipher, maxSolutions=DEFAULT_MAX):
        """Like solveAnagram, but explores candidates in random order."""
        return self.solveAnagram(cipher, maxSolutions, lambda x: random.random())

    def canRecur(self):
        """Return whether multi-word solutions are allowed."""
        return self._multipleWords

    def solveAnagram(self, cipher, maxSolutions=DEFAULT_MAX, sortKey=DEFAULT_KEY):
        """Yield anagrams of cipher.

        Args:
            cipher: the letters to rearrange (normalised by formatCipher).
            maxSolutions: stop after this many solutions; 0 or less means
                no limit.
            sortKey: key function ordering the candidate keys at each step.
        """
        charMap = formatCipher(cipher)
        solutions = 0
        for solution in self._solveAnagramEntry(charMap, sortKey):
            solutions += 1
            yield solution
            if solutions >= maxSolutions and maxSolutions > 0:
                break

    def solvePalindrome(self, cipher, maxSolutions=DEFAULT_MAX, sortKey=DEFAULT_KEY):
        """Yield palindromes that use the letters of cipher.

        Yields nothing when possiblePalindrome(cipher) is false.  The
        parameters have the same meaning as for solveAnagram.
        """
        if not possiblePalindrome(cipher):
            # Plain return: raising StopIteration inside a generator is a
            # RuntimeError since PEP 479.
            return
        charMap = formatCipher(cipher)
        solutions = 0
        for solution in self._solvePalindromeEntry(charMap, sortKey):
            solutions += 1
            yield solution
            if solutions >= maxSolutions and maxSolutions > 0:
                break

    def _solvePalindromeEntry(self, charMap, sortKey, froot=None, rroot=None):
        """Start (or restart at a word boundary) the two-ended search.

        froot / rroot default to the roots of the forward / reverse trie; a
        non-default value continues an unfinished word on that side.
        """
        if froot is None:
            froot = self._forwardTrie._root
        if rroot is None:
            rroot = self._reverseTrie._root
        # A letter is usable only if it is still available and both tries
        # can continue with it.
        keys = set(charMap.keys()) & \
            set(froot.keys()) & \
            set(rroot.keys())
        for key in sorted(keys, key=sortKey):
            fnode = froot._get(key)
            rnode = rroot._get(key)
            for solution in self._solvePalindromeRecursive(charMap, sortKey, fnode, rnode):
                yield solution

    def _solvePalindromeRecursive(self, charMap, sortKey, fnode, rnode):
        """Place fnode's letter at both ends and recurse on what is left."""
        # Two copies are used (front and rear); a single remaining copy can
        # only be the middle letter.
        count = min(charMap[fnode._letter], 2)
        tmpMap = _deductKey(charMap, fnode._letter, count)
        if count == 1 and len(tmpMap) != 0:
            # A lone letter cannot be mirrored while other letters remain.
            return

        if len(tmpMap) == 0:
            # All letters used: the centre is a solution if the word(s)
            # spanning it are valid.
            if self._validWord(fnode, rnode, count*fnode._letter):
                yield count*fnode._letter
            return

        keys = set(tmpMap.keys()) & \
            set(rnode.keys()) & \
            set(fnode.keys())
        if self.canRecur():
            if fnode._isTerminal():
                keys.add(Wordplay.END_FRONT)
            if rnode._isTerminal():
                keys.add(Wordplay.END_REAR)
            if rnode._isTerminal() and fnode._isTerminal():
                keys.add(Wordplay.END_BOTH)

        for key in sorted(keys, key=sortKey):
            prefix = fnode._letter
            suffix = rnode._letter
            if key == Wordplay.END_FRONT:
                solutionGen = self._solvePalindromeEntry(tmpMap, sortKey, None, rnode)
                prefix += " "
            elif key == Wordplay.END_REAR:
                solutionGen = self._solvePalindromeEntry(tmpMap, sortKey, fnode, None)
                suffix = " " + suffix
            elif key == Wordplay.END_BOTH:
                solutionGen = self._solvePalindromeEntry(tmpMap, sortKey)
                suffix = " " + suffix
                prefix += " "
            else:
                recurFNode = fnode._get(key)
                recurRNode = rnode._get(key)
                solutionGen = self._solvePalindromeRecursive(tmpMap, sortKey,
                                                             recurFNode, recurRNode)
            for subSolution in solutionGen:
                if subSolution is not None:
                    yield prefix + subSolution + suffix

    def _validWord(self, fnode, rnode, middle):
        """Return whether the word spanning the palindrome's centre is known.

        The letters along both trie paths are collected around middle by
        following the _parent links up to (not including) a node whose
        parent has no letter.
        """
        while fnode._parent and fnode._parent._letter is not None:
            fnode = fnode._parent
            middle = fnode._letter + middle
        while rnode._parent and rnode._parent._letter is not None:
            rnode = rnode._parent
            middle = middle + rnode._letter
        return self.has(middle)

    def _solveAnagramEntry(self, charMap, sortKey):
        """Start a new word at the root of the forward trie."""
        root = self._forwardTrie._root
        keys = set(charMap.keys()) & set(root.keys())
        for key in sorted(keys, key=sortKey):
            node = root._get(key)
            for solution in self._solveAnagramRecursive(charMap, sortKey, node):
                yield solution

    def _solveAnagramRecursive(self, charmap, sortKey, node):
        """Consume node's letter and extend the current word with the rest."""
        tmpMap = _deductKey(charmap, node._letter)

        if len(tmpMap) == 0:
            # All letters used: a solution only if a word ends here.
            if node._isTerminal():
                yield node._letter
            return

        keys = set(tmpMap.keys()) & set(node._nextMap.keys())
        if node._isTerminal() and self.canRecur():
            # None stands for "end this word and start another one".
            keys.add(None)

        for key in sorted(keys, key=sortKey):
            prefix = node._letter
            if key is None:
                solutionGen = self._solveAnagramEntry(tmpMap, sortKey)
                prefix += " "
            else:
                recurNode = node._get(key)
                solutionGen = self._solveAnagramRecursive(tmpMap, sortKey, recurNode)
            for subSolution in solutionGen:
                if subSolution is not None:
                    yield prefix + subSolution

    def _addWord(self, word):
        """Store word (stripped, upper-cased) unless it is shorter than minWordSize."""
        word = word.strip().upper()
        if len(word) < self._minWordSize:
            return
        self._forwardTrie.addWord(word)
        self._reverseTrie.addWord(word[::-1])