Example #1
def test_addNgramCount(self):
    order = 3
    lm = NgramLM(order)
    li = "who knows who can do that ?".split()
    # Add every sliding window of `order` consecutive tokens as an n-gram
    for i in xrange(len(li) - order + 1):
        lm.addNgramCount(li[i:i+order])
    self.assertEqual(len(lm._root.count), len(li) - order + 1)
    self.assertEqual(len(lm._root.children), len(li) - order + 1)
    for i in xrange(order - 2, len(li) - 1):
        self.assertTrue(li[i] in lm._root.children)
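For reference, the sliding window in the test enumerates exactly five trigrams; the standalone snippet below (plain Python, nothing assumed beyond the test's own data) prints them.

# Illustration of the sliding window used in the test above: it yields
# the five trigrams that addNgramCount receives.
order = 3
li = "who knows who can do that ?".split()
for i in xrange(len(li) - order + 1):
    print li[i:i + order]
# ['who', 'knows', 'who'], ['knows', 'who', 'can'], ['who', 'can', 'do'],
# ['can', 'do', 'that'], ['do', 'that', '?']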
Example #2
def build_LM(in_file):
    """
    build language models for each label
    each line in in_file contains a label and a string separated by a space
    """
    print('building language models...')

    # LMs maps each label to the language model trained for that label
    LMs = {}
    with open(in_file, 'r') as fin:
        for line in fin:
            (label, text) = line.strip("\r\n").split(" ", 1)
            if label not in LMs:
                # gram_size, token_based, start_end, case_sensitive, strip_out
                # and add_one_smoothing come from module-level configuration in
                # the original source (they are not defined in this snippet)
                LMs[label] = NgramLM(label,
                                     gram_size=gram_size,
                                     token_based=token_based,
                                     start_end=start_end,
                                     case_sensitive=case_sensitive,
                                     strip_out=strip_out,
                                     add_one_smoothing=add_one_smoothing)
            LMs[label].train(text)

    return LMs
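A minimal driving sketch, assuming build_LM and NgramLM are importable; the file contents, labels, and configuration values below are hypothetical, not taken from the original source.

# Hypothetical configuration; build_LM reads these module-level names
gram_size = 4
token_based = False
start_end = True
case_sensitive = False
strip_out = ""
add_one_smoothing = True

# A tiny made-up training file: one "<label> <text>" pair per line
with open("input.train.txt", "w") as fout:
    fout.write("malaysian semua manusia dilahirkan bebas\n")
    fout.write("indonesian semua orang dilahirkan merdeka\n")

LMs = build_LM("input.train.txt")
print('trained labels: %s' % sorted(LMs.keys()))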
Example #3
def __init__(self, countkey=lambda tree: tree.data["pos"], smoothing="ml"):
    self.probHead = NgramLM(1)   # unigram model over root head tags
    self.probLeft = NgramLM(3)   # trigram model over left dependents
    self.probRight = NgramLM(3)  # trigram model over right dependents
    self.countkey = countkey
    self.smoothing = smoothing
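The shapes of the events each sub-model counts follow from countFreq in Example #4, assuming this __init__ belongs to the DependencyLM class shown there; the calls below are illustrative only, with made-up POS tags.

lm = DependencyLM()
lm.probHead.addNgramCount(["VB"])                          # root head tag (unigram)
lm.probLeft.addNgramCount(["___none", "VB___head", "NN"])  # first left dependent
lm.probLeft.addNgramCount(["NN", "VB___head", "DT"])       # later left dependent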
Example #4
class DependencyLM(object):

    def __init__(self, countkey=lambda tree: tree.data["pos"], smoothing="ml"):
        self.probHead = NgramLM(1)
        self.probLeft = NgramLM(3)
        self.probRight = NgramLM(3)
        self.countkey = countkey
        self.smoothing = smoothing

    def train(self, filename, modelFile, progress=10000):
        """
        filename: Filename of CONLL format
        progress: print dot(.) every #progress trees; default: 10000
        """
        cnt = 0
        print "Model training started"
        for depString in dph.readDependencyFile(filename):
            cnt += 1
            if cnt % progress == 0:
                sys.stdout.write(".")
            tree = dph.stringToDependencyTreeWeakRef(depString)
            if tree is None:
                continue
            self.countFreq(tree)
        if self.smoothing == "ml":
            # Apply maximum likelihood estimation to get probabilities from counts
            self.probHead.mlEstimate()
            self.probLeft.mlEstimate()
            self.probRight.mlEstimate()
        else:
            raise NotImplementedError("Currently only maximum likelihood is supported as a smoothing method (though ML is not a smoothing technique)")

        if cnt >= progress:
            print  # newline after the progress dots (skipped if no dots were printed)
        print "Model training has successfully finished!"
        print "Writing model infomation to %s" % (modelFile, )
        self.saveModelAsPlainText(modelFile)
        print "Finished writing model information to %s" % (modelFile, )

    def saveModelAsPlainText(self, filename):
        with open(filename, "w") as model:
           model.write("[probHead]\n")
           self.probHead.saveNgramInfo(fstream=model)
           model.write("[probLeft]\n")
           self.probLeft.saveNgramInfo(fstream=model)
           model.write("[probRight]\n")
           self.probRight.saveNgramInfo(fstream=model)
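    # The resulting plain-text model looks like the sketch below (format
    # inferred from readModelFromPlainText; tags and numbers are illustrative):
    #
    #   [probHead]
    #   VB<TAB>42<TAB>-1.386294
    #   [probLeft]
    #   ___none VB___head NN<TAB>7<TAB>-2.079442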

    def readModelFromPlainText(self, source):
        # source may be a filename or an already-open file object
        openedHere = False
        if isinstance(source, basestring):
            model = open(source, "r")
            openedHere = True
        else:
            model = source

        state = None
        for line in model:
            line = line.strip()
            if line in ["[probHead]", "[probLeft]", "[probRight]"]:
                state = line
                continue

            if line == "":
                continue

            ngram, count, logProb = line.split("\t")
            ngram = ngram.split(" ")
            count = int(count)
            if state == "[probHead]":
                self.probHead.addNgramCount(ngram, count)

            elif state == "[probLeft]":
                self.probLeft.addNgramCount(ngram, count)
            
            elif state == "[probRight]":
                self.probRight.addNgramCount(ngram, count)

        if self.smoothing == "ml":
            # Apply maximum likelihood estimation to get probabilities from counts
            self.probHead.mlEstimate()
            self.probLeft.mlEstimate()
            self.probRight.mlEstimate()

        else:
            raise NotImplementedError("Currently only maximum likelihood is supported as a smoothing method (though ML is not a smoothing technique)")

        if openedHere:  # close the stream only if it was opened in this method
            model.close()


    def saveModelAsProtocolBuffer(self, filename):
        with open(filename, "wb") as model:
            lmpb = depLM_pb2.depLM()
            self.probHead.writeMessage(lmpb.probHead.ngramEntries)
            self.probLeft.writeMessage(lmpb.probLeft.ngramEntries)
            self.probRight.writeMessage(lmpb.probRight.ngramEntries)
            model.write(lmpb.SerializeToString())
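    # The protobuf schema implied by these methods (a reconstruction, not the
    # project's actual .proto file):
    #
    #   message NgramEntry { repeated string ngram = 1; int32 count = 2; double prob = 3; }
    #   message NgramModel { repeated NgramEntry ngramEntries = 1; }
    #   message depLM { NgramModel probHead = 1; NgramModel probLeft = 2; NgramModel probRight = 3; }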
             
    def readModelFromProtocolBuffer(self, source):
        # source may be a filename or an already-open binary file object
        openedHere = False
        if isinstance(source, basestring):
            model = open(source, "rb")
            openedHere = True
        else:
            model = source

        lmpb = depLM_pb2.depLM()
        lmpb.ParseFromString(model.read())
        for ngramEntry in lmpb.probHead.ngramEntries:
            self.probHead.addNgramProb(ngramEntry.ngram, ngramEntry.prob)
            self.probHead.addNgramCount(ngramEntry.ngram, ngramEntry.count)
        for ngramEntry in lmpb.probLeft.ngramEntries:
            self.probLeft.addNgramProb(ngramEntry.ngram, ngramEntry.prob)
            self.probLeft.addNgramCount(ngramEntry.ngram, ngramEntry.count)
        for ngramEntry in lmpb.probRight.ngramEntries:
            self.probRight.addNgramProb(ngramEntry.ngram, ngramEntry.prob)
            self.probRight.addNgramCount(ngramEntry.ngram, ngramEntry.count)
        if openedHere:  # close the stream only if it was opened in this method
            model.close()

    def countFreq(self, node):
        # The root of the tree contributes a unigram count to the head model
        if node.parent is None:
            self.probHead.addNgramCount([self.countkey(node)])
        left, right = node.partitionChildren()

        # Left dependents: trigram (previous sibling, head, dependent);
        # "___none" marks the first dependent, "___head" marks the head token
        if len(left) > 0:
            self.probLeft.addNgramCount(["___none", self.countkey(left[0].parent) + "___head", self.countkey(left[0])])
        for index in xrange(1, len(left)):
            self.probLeft.addNgramCount([self.countkey(left[index - 1]), self.countkey(left[index].parent) + "___head", self.countkey(left[index])])

        # Right dependents are counted the same way in their own model
        if len(right) > 0:
            self.probRight.addNgramCount(["___none", self.countkey(right[0].parent) + "___head", self.countkey(right[0])])
        for index in xrange(1, len(right)):
            self.probRight.addNgramCount([self.countkey(right[index - 1]), self.countkey(right[index].parent) + "___head", self.countkey(right[index])])

        # Recurse into all children
        for child in node.children:
            self.countFreq(child)
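A hedged end-to-end sketch of using this class; the paths below are placeholders, and the original module is assumed to import sys, the dependency helpers dph, and the generated depLM_pb2 protobuf module.

# Hypothetical driver; "train.conll" and "model.txt" are placeholder paths
dlm = DependencyLM()
dlm.train("train.conll", "model.txt")  # count events, ML-estimate, save

# Reload the saved plain-text model into a fresh instance
dlm2 = DependencyLM()
dlm2.readModelFromPlainText("model.txt")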