示例#1
0
def main():
    """Index a text file by word and interactively look up words by prefix.

    Prompts for a file name until an existing path is given, then records
    every word of the file in a Trie together with its positions, stored as
    "<line>-<word>" strings (both 1-based; the word number counts the
    whitespace-separated tokens of the line). Afterwards it loops forever,
    asking for a prefix and printing each matching word with its positions.
    The loop only ends when the user interrupts the program (Ctrl + C).
    """
    # Trie allows us to retrieve all words beginning with the designated prefix
    wordTrie = Trie()
    # maps each indexed word to the list of its "<line>-<word>" positions
    instances = {}

    # get file name from user, re-prompting until the path exists
    prompt = "Please enter file to be indexed (be sure to include .txt extension): "
    filename = input(prompt)
    while not path.exists(filename):
        print("File does not exist.")
        filename = input(prompt)

    # read words from file, recording their position along the way; the
    # context manager guarantees the file is closed once indexing is done
    with open(filename, "r") as source:
        for line_num, line in enumerate(source, 1):
            for word_num, token in enumerate(line.split(), 1):
                # lowercase the token and drop everything except letters and hyphens
                word = re.sub(r'[^a-zA-Z-]', "", token.lower())
                if not word:
                    # token held no letters (e.g. "123" or "..."): nothing to
                    # index, but it still counts towards the word position
                    continue
                wordTrie.addWord(word)
                instances.setdefault(word, []).append(
                    str(line_num) + "-" + str(word_num))

    # allow user to search for desired prefix, printing all words beginning with entered prefix
    while True:
        pre = input(
            "Please enter prefix of word, or entire word, you'd like to search for (Ctrl + C to quit): "
        )
        # indexed words are lowercase, so search with a lowercase prefix
        words = wordTrie.get_prefix(pre.lower())
        if not words:
            print("There are no words beginning with", pre,
                  "- Please try again.")
        else:
            for item in words:
                print(item, str(instances[item]))
示例#2
0
class testTrie(unittest.TestCase):
    """Unit tests for Trie; every test starts from a trie holding "cat"."""

    def setUp(self):
        """Create a fresh trie containing the single word "cat"."""
        self.trie = Trie()
        self.trie.addWord("cat")

    def test_check_is_word(self):
        """A stored word is recognised; a mere prefix of it is not."""
        self.assertTrue(self.trie.isWord("cat"))
        self.assertFalse(self.trie.isWord("ca"))

    def test_remove_word(self):
        """A removed word is no longer reported as a word."""
        self.trie.removeWord("cat")
        self.assertFalse(self.trie.isWord("cat"))

    def test_words_with_shared_letters(self):
        """Words sharing a prefix coexist without making the prefix a word."""
        self.trie.addWord("cab")
        self.assertTrue(self.trie.isWord("cat"))
        self.assertFalse(self.trie.isWord("ca"))
        self.assertTrue(self.trie.isWord("cab"))

    def test_node_children(self):
        """The sentinel node has one child per distinct first letter."""
        self.trie.addWord("cab")
        self.trie.addWord("dog")
        # keys() is a view on Python 3 (never equal to a list) and has no
        # guaranteed order on Python 2, so compare a sorted list instead
        self.assertEqual(
            sorted(self.trie._sentinel.children.keys()), ["c", "d"])
示例#3
0
def makeTrie(dict_file):
    """Build a Trie from the text file at `dict_file`.

    Each line of the file is stripped of surrounding whitespace, converted
    to upper case and added to the trie as one word.

    Returns the populated Trie.
    """
    trie = Trie()
    with open(dict_file, 'r') as handle:
        normalized = (line.strip().upper() for line in handle)
        for entry in normalized:
            trie.addWord(entry)
    return trie
示例#4
0
 def makeTrie(self, words):
     """Build and return a new Trie holding every item of `words`.

     `words` is iterated once and each item is passed to Trie.addWord
     unchanged.
     """
     result = Trie()
     for entry in words:
         result.addWord(entry)
     return result
示例#5
0
class Dictionary:
    """Word dictionary with unigram/bigram counts, Good-Turing smoothing and HMM tables.

    The data is loaded either from tab-separated unigram/bigram dictionary
    files or from a JSON dump previously produced by writeJsonData. The HMM
    tables are always loaded from their own JSON file.

    NOTE: this class targets Python 2 (it relies on `str.decode` when
    reading the dictionary files).
    """

    def __init__(self, json_data=None, u_file="data/udict.txt",
                 b_file="data/bdict.txt", HMMfile="data/HMM.json"):
        """Load the HMM tables, then the dictionary data.

        json_data: path of a JSON dump to load; when falsy, the dictionary
            is built from `u_file` and `b_file` instead.
        u_file: unigram dictionary, one "word<TAB>frequency" entry per line.
        b_file: bigram dictionary, one "word1 word2<TAB>frequency" entry per line.
        HMMfile: JSON file holding the 'init', 'trans' and 'emit' tables.
        """
        # trie holding every unigram word
        self.dictTree = Trie()

        # unigram counts and their total
        self.u_word = {}
        self.wordN = 0

        # bigram counts ("word1 word2" -> frequency) and their total
        self.b_word = {}
        self.bwordN = 0

        # Good-Turing parameters: raw counts below k are replaced by c[count]
        self.k = 5
        self.c = [0] * self.k

        # HMM tables
        self.HMM_init = {}
        self.HMM_trans = {}
        self.HMM_emit = {}
        self.__loadHMMJson(HMMfile)

        if not json_data:
            self.__loaddic(u_file, b_file)
        else:
            self.__loadjsondata(json_data)

    def findWord(self, word):
        """Return the first element of the trie's getWrodN(word) result.

        getDAG iterates this value and uses each item as an offset, so it is
        presumably a collection of lengths of dictionary words that `word`
        starts with -- confirm against Trie.getWrodN.
        """
        return self.dictTree.getWrodN(word)[0]

    def getDAG(self, sentence):
        """Build the segmentation graph of `sentence` as an adjacency matrix.

        Returns a (len+1) x (len+1) list of lists where DAG[i][j] == 1 means
        sentence[i:j] is a candidate token: every single character is one,
        and so is every span reported by findWord for the suffix starting
        at i.
        """
        s_len = len(sentence)

        DAG = [[0] * (s_len + 1) for _ in range(s_len + 1)]

        for i in range(s_len + 1):
            s_t = self.findWord(sentence[i:])
            if i != s_len:
                # a single character is always a valid step
                DAG[i][i + 1] = 1
            for t in s_t:
                DAG[i][i + t] = 1
        return DAG

    def __smoothedCount(self, word1, word2):
        """Return the bigram count of "word1 word2", Good-Turing adjusted when below k."""
        n = self.b_word.get(word1 + ' ' + word2, 0)
        if n < self.k:
            n = self.c[n]
        return n

    def get2GramProb(self, word1, word2):
        """Return the smoothed probability of the bigram (word1, word2)."""
        return float(self.__smoothedCount(word1, word2)) / self.bwordN

    def get2GramProbLog(self, word1, word2):
        """Return the natural log of the smoothed bigram probability.

        Raises ValueError (from math.log) when the smoothed count is not
        positive.
        """
        n = self.__smoothedCount(word1, word2)
        return math.log(float(n)) - math.log(self.bwordN)

    def writeJsonData(self, filepath):
        """Serialize the trie data, n-gram counts and Good-Turing parameters as JSON.

        NOTE(review): the file is opened in append mode ('a+'), so writing
        to a path that already holds a dump yields two concatenated JSON
        documents, which json.loads cannot parse -- confirm whether 'w' was
        intended.
        """
        data = {}
        data['root'] = self.dictTree.getData()
        data['u_word'] = self.u_word
        data['wordN'] = self.wordN
        data['b_word'] = self.b_word
        data['bwordN'] = self.bwordN
        data['K'] = self.k
        data['C'] = self.c

        print("writting json file...")
        with open(filepath, 'a+') as fp:
            fp.write(json.dumps(data))

    def __loadjsondata(self, json_file):
        """Restore the trie, the n-gram counts and the Good-Turing parameters from a JSON dump."""
        print("loading Json file...")
        # the context manager closes the file even if parsing fails
        with open(json_file) as jsonfp:
            data = json.loads(jsonfp.read())

        self.dictTree.setData(data['root'])

        self.u_word = data['u_word']
        self.wordN = data['wordN']

        self.b_word = data['b_word']
        self.bwordN = data['bwordN']

        # the Good-Turing coefficients are taken from the dump, not recomputed
        self.c = data['C']
        self.k = data['K']

    def __loaddic(self, u_file, b_file):
        """Load the unigram and bigram dictionaries, then compute the Good-Turing coefficients.

        Blank lines and lines without a tab-separated frequency are skipped.
        Bigram lines whose key is not made of at least two space-separated
        words are skipped as well.
        """
        # read the unigram dictionary
        print("load 1gram dict...")
        with open(u_file) as ufp:
            for line in ufp:
                line = line.strip()
                if line == "":
                    continue
                fields = line.split('\t')
                # guard against malformed dictionary lines
                if len(fields) < 2:
                    continue
                word = fields[0].decode('utf-8')
                freq = int(fields[1])
                self.u_word[word] = freq
                self.wordN += freq
                self.dictTree.addWord(word)

        # read the bigram dictionary
        print("load 2gram dict...")
        with open(b_file) as bfp:
            for line in bfp:
                line = line.strip()
                if line == "":
                    continue
                fields = line.split('\t')
                # guard against malformed dictionary lines
                if len(fields) < 2:
                    continue
                if len(fields[0].split(' ')) < 2:
                    continue
                word = fields[0].decode('utf-8')
                freq = int(fields[1])
                self.b_word[word] = freq
                self.bwordN += freq

        print("%d words." % self.wordN)

        self.__computeGoodTuring()

    def __computeGoodTuring(self):
        """Fill self.c with the adjusted counts for raw bigram counts 0..k-1.

        NC[r] is the number of bigrams seen exactly r times; NC[0] is
        estimated as wordN**2 - bwordN. With d = (k+1)*NC[k+1]/NC[1]:

            c[r] = ((r+1)*NC[r+1]/NC[r] - r*d) / (1 - d)

        Raises KeyError when no bigram occurs exactly r times for some r in
        1..k+1, and ZeroDivisionError when NC[0] is 0 or d equals 1.
        """
        NC = {}
        for count in self.b_word.values():
            NC[count] = NC.get(count, 0) + 1
        NC[0] = self.wordN * self.wordN - self.bwordN

        # float() forces true division; plain "/" on these ints truncates
        # under Python 2 and would (typically) zero out the discount term
        discount = float((self.k + 1) * NC[self.k + 1]) / NC[1]

        for i in range(self.k):
            turing = (i + 1) * float(NC[i + 1]) / float(NC[i])
            self.c[i] = (turing - i * discount) / (1 - discount)

    def __loadHMMJson(self, file):
        """Load the HMM 'init', 'trans' and 'emit' tables from the JSON file `file`."""
        # the context manager closes the file even if parsing fails
        with open(file) as fp:
            data = json.loads(fp.read())
        self.HMM_init = data['init']
        self.HMM_trans = data['trans']
        self.HMM_emit = data['emit']
示例#6
0
class Wordplay:
  """Anagram and palindrome solver backed by a forward and a reversed trie.

  Every word of the word list is stored upper-cased in a forward trie and,
  reversed, in a second trie. The solvers are generators that walk the
  tries while consuming letters from a letter -> count map built from the
  cipher by formatCipher.
  """

  # maxSolutions value meaning "no limit"
  DEFAULT_MAX = 0
  # default sort key: visit candidate letters in their natural order
  DEFAULT_KEY = lambda x:x

  # sentinel "keys" used by the palindrome search to mark a word boundary
  # at the front, at the rear, or at both ends of the partial solution
  END_FRONT = 1
  END_REAR = 2
  END_BOTH = 3

  def __init__(self,
    wordlist='wordlists/simple.txt',
    multipleWords=False,
    minWordSize=1):
    """Load `wordlist` (one word per line) into both tries.

    multipleWords: allow solutions made of several space-separated words.
    minWordSize: words shorter than this (after stripping) are ignored.
    """
    self._reverseTrie = Trie()
    self._forwardTrie = Trie()

    self._minWordSize = minWordSize
    self._multipleWords = multipleWords

    # the context manager closes the file even if adding a word fails
    with open(wordlist) as fp:
      for word in fp:
        self._addWord(word)

  def has(self, word):
    """Return whether `word` is in the forward trie and its reverse in the reversed trie."""
    return self._forwardTrie.has(word) and \
      self._reverseTrie.has(word[::-1])

  def pickRandomAnagram(self, cipher):
    """Picks a random anagram of cipher and returns it or None."""
    result = list(self.solveRandomAnagram(cipher, 1))
    if len(result) == 0:
      return None
    return result[0]

  def pickRandomPalindrome(self, cipher):
    """Picks a random palindrome built from cipher and returns it or None."""
    result = list(self.solveRandomPalindrome(cipher, 1))
    if len(result) == 0:
      return None
    return result[0]

  def pickFirst(self, cipher):
    """Picks the first anagram it can find.

    NOTE(review): this calls self.solve, which is not defined in the part
    of the class visible here -- confirm it exists (solveAnagram may have
    been intended).
    """
    result = list(self.solve(cipher, 1))
    if len(result) == 0:
      return None
    return result[0]

  def solveRandomPalindrome(self, cipher, maxSolutions=DEFAULT_MAX):
    """Like solvePalindrome, but visits candidate letters in random order."""
    return self.solvePalindrome(cipher, maxSolutions, lambda x:
      random.random())

  def solveRandomAnagram(self, cipher, maxSolutions=DEFAULT_MAX):
    """Like solveAnagram, but visits candidate letters in random order."""
    return self.solveAnagram(cipher, maxSolutions, lambda x: random.random())

  def canRecur(self):
    """Return whether multi-word solutions are allowed."""
    return self._multipleWords

  def solveAnagram(self, cipher, maxSolutions=DEFAULT_MAX, sortKey=DEFAULT_KEY):
    """Yield anagrams of `cipher`, stopping after `maxSolutions` (0 = no limit).

    sortKey orders the candidate letters tried at each step.
    """
    charMap = formatCipher(cipher)
    solutions = 0

    for solution in self._solveAnagramEntry(charMap, sortKey):
      solutions += 1
      yield solution
      if solutions >= maxSolutions and maxSolutions > 0:
        break

  def solvePalindrome(self, cipher, maxSolutions=DEFAULT_MAX,
    sortKey=DEFAULT_KEY):
    """Yield palindromes built from `cipher`, stopping after `maxSolutions` (0 = no limit).

    Yields nothing when possiblePalindrome(cipher) is false.
    """
    if not possiblePalindrome(cipher):
      # a plain return ends the generator; raising StopIteration here
      # becomes a RuntimeError under PEP 479 (Python 3.7+)
      return

    charMap = formatCipher(cipher)
    solutions = 0

    for solution in self._solvePalindromeEntry(charMap, sortKey):
      solutions += 1
      yield solution
      if solutions >= maxSolutions and maxSolutions > 0:
        break

  def _solvePalindromeEntry(self, charMap, sortKey, froot=None, rroot=None):
    """Start (or restart at a word boundary) the palindrome search.

    froot / rroot are the nodes to continue from in the forward and the
    reversed trie; None means the root of the respective trie. Only letters
    available in charMap and under both nodes are tried.
    """
    if froot is None:
      froot = self._forwardTrie._root
    if rroot is None:
      rroot = self._reverseTrie._root
    keys = set(charMap.keys()) & \
      set(froot.keys()) & \
      set(rroot.keys())

    for key in sorted(keys, key=sortKey):
      fnode = froot._get(key)
      rnode = rroot._get(key)
      for solution in self._solvePalindromeRecursive(charMap, sortKey, fnode,
        rnode):
        yield solution

  def _solvePalindromeRecursive(self, charMap, sortKey, fnode, rnode):
    """Extend a palindrome by placing fnode's letter at both ends.

    Up to two copies of the letter are taken out of charMap (via
    _deductKey). Yields the string formed from this level inwards: the
    letter, the inner solution, and the letter again.
    """
    count = min(charMap[fnode._letter], 2)
    tmpMap = _deductKey(charMap, fnode._letter, count)

    if count == 1 and len(tmpMap) != 0:
      # a letter available only once can only be the middle of the
      # palindrome, so nothing may remain to be placed after it
      # (plain return: see PEP 479)
      return

    keys = set(tmpMap.keys()) & \
      set(rnode.keys()) & \
      set(fnode.keys())

    if fnode._isTerminal() and self.canRecur():
      keys.add(Wordplay.END_FRONT)

    if rnode._isTerminal() and self.canRecur():
      keys.add(Wordplay.END_REAR)

    if rnode._isTerminal() and fnode._isTerminal() and self.canRecur():
      keys.add(Wordplay.END_BOTH)

    if len(tmpMap) == 0 and self._validWord(fnode, rnode, count*fnode._letter):
      yield count*fnode._letter
    elif len(tmpMap) == 0:
      pass
    elif len(keys) == 0:
      pass
    else:
      # NOTE(review): keys may mix letters with the integer END_* sentinels;
      # on Python 3 an identity sortKey cannot order such a mix (TypeError)
      for key in sorted(keys, key=sortKey):
        solutionGen = None
        prefix = fnode._letter
        suffix = rnode._letter

        if key == Wordplay.END_FRONT:
          # a word ends at the front: restart the forward trie at its root
          solutionGen = self._solvePalindromeEntry(tmpMap, sortKey, None,
            rnode)
          prefix += " "
        elif key == Wordplay.END_REAR:
          # a word ends at the rear: restart the reversed trie at its root
          solutionGen = self._solvePalindromeEntry(tmpMap, sortKey, fnode,
            None)
          suffix = " " + suffix
        elif key == Wordplay.END_BOTH:
          solutionGen = self._solvePalindromeEntry(tmpMap, sortKey)
          suffix = " " + suffix
          prefix += " "
        else:
          recurFNode = fnode._get(key)
          recurRNode = rnode._get(key)
          solutionGen = self._solvePalindromeRecursive(tmpMap, sortKey,
            recurFNode, recurRNode)

        for subSolution in solutionGen:
          if subSolution is not None:
            yield prefix + subSolution + suffix

  def _validWord(self, fnode, rnode, middle):
    """Check that the word spanning the middle of the palindrome exists.

    Walks up from fnode prepending ancestor letters to `middle`, then up
    from rnode appending ancestor letters, stopping below a node whose
    parent has no letter. Returns self.has() of the resulting string.
    """
    while fnode._parent and fnode._parent._letter is not None:
      fnode = fnode._parent
      middle = fnode._letter + middle

    while rnode._parent and rnode._parent._letter is not None:
      rnode = rnode._parent
      middle = middle + rnode._letter

    return self.has(middle)

  def _solveAnagramEntry(self, charMap, sortKey):
    """Start a word at the forward trie's root using the letters left in charMap."""
    root = self._forwardTrie._root
    keys = set(charMap.keys()) & set(root.keys())

    for key in sorted(keys, key=sortKey):
      node = root._get(key)
      for solution in self._solveAnagramRecursive(charMap, sortKey, node):
        yield solution

  def _solveAnagramRecursive(self, charmap, sortKey, node):
    """Consume node's letter from charmap and yield every completion from here.

    Each yielded string starts with node's letter. A None key stands for a
    word boundary and is only tried when multi-word solutions are allowed.
    """
    tmpMap = _deductKey(charmap, node._letter)
    keys = set(tmpMap.keys()) & set(node._nextMap.keys())

    if node._isTerminal() and self.canRecur():
      keys.add(None)

    if len(tmpMap) == 0 and node._isTerminal():
      yield node._letter
    elif len(tmpMap) == 0:
      pass
    elif len(keys) == 0:
      pass
    else:
      # NOTE(review): keys may mix letters with None; on Python 3 an
      # identity sortKey cannot order such a mix (TypeError)
      for key in sorted(keys, key=sortKey):
        solutionGen = None
        prefix = node._letter

        if key is None:
          # word boundary: continue with a new word from the trie root
          solutionGen = self._solveAnagramEntry(tmpMap, sortKey)
          prefix += " "
        else:
          recurNode = node._get(key)
          solutionGen = self._solveAnagramRecursive(tmpMap, sortKey, recurNode)

        for subSolution in solutionGen:
          if subSolution is not None:
            yield prefix + subSolution

  def _addWord(self, word):
    """Normalize `word` (strip, upper-case) and add it to both tries.

    Words shorter than the configured minimum size are skipped.
    """
    word = word.strip().upper()
    if len(word) < self._minWordSize:
      return
    self._forwardTrie.addWord(word)
    self._reverseTrie.addWord(word[::-1])