Example #1
def test_addNgramCount(self):
    order = 3
    lm = NgramLM(order)
    li = "who knows who can do that ?".split()
    # Add every sliding window of `order` consecutive tokens as an n-gram
    for i in xrange(len(li) - order + 1):
        lm.addNgramCount(li[i:i+order])
    self.assertEqual(len(lm._root.count), len(li) - order + 1)
    self.assertEqual(len(lm._root.children), len(li) - order + 1)
    for i in xrange(order - 2, len(li) - 1):
        self.assertTrue(li[i] in lm._root.children)
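For reference, the sliding window in the test enumerates exactly five trigrams; the standalone snippet below (plain Python, nothing assumed beyond the test's own data) prints them.

# Illustration of the sliding window used in the test above: it yields
# the five trigrams that addNgramCount receives.
order = 3
li = "who knows who can do that ?".split()
for i in xrange(len(li) - order + 1):
    print li[i:i + order]
# ['who', 'knows', 'who'], ['knows', 'who', 'can'], ['who', 'can', 'do'],
# ['can', 'do', 'that'], ['do', 'that', '?']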
Example #2
def build_LM(in_file):
    """
    build language models for each label
    each line in in_file contains a label and a string separated by a space
    """
    print('building language models...')

    # LMs maps each label to the language model trained for that label
    LMs = {}
    with open(in_file, 'r') as fin:
        for line in fin:
            (label, text) = line.strip("\r\n").split(" ", 1)
            if label not in LMs:
                # gram_size, token_based, start_end, case_sensitive, strip_out
                # and add_one_smoothing come from module-level configuration in
                # the original source (they are not defined in this snippet)
                LMs[label] = NgramLM(label,
                                     gram_size=gram_size,
                                     token_based=token_based,
                                     start_end=start_end,
                                     case_sensitive=case_sensitive,
                                     strip_out=strip_out,
                                     add_one_smoothing=add_one_smoothing)
            LMs[label].train(text)

    return LMs
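A minimal driving sketch, assuming build_LM and NgramLM are importable; the file contents, labels, and configuration values below are hypothetical, not taken from the original source.

# Hypothetical configuration; build_LM reads these module-level names
gram_size = 4
token_based = False
start_end = True
case_sensitive = False
strip_out = ""
add_one_smoothing = True

# A tiny made-up training file: one "<label> <text>" pair per line
with open("input.train.txt", "w") as fout:
    fout.write("malaysian semua manusia dilahirkan bebas\n")
    fout.write("indonesian semua orang dilahirkan merdeka\n")

LMs = build_LM("input.train.txt")
print('trained labels: %s' % sorted(LMs.keys()))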
Example #3
def __init__(self, countkey=lambda tree: tree.data["pos"], smoothing="ml"):
    self.probHead = NgramLM(1)   # unigram model over root head tags
    self.probLeft = NgramLM(3)   # trigram model over left dependents
    self.probRight = NgramLM(3)  # trigram model over right dependents
    self.countkey = countkey
    self.smoothing = smoothing
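The shapes of the events each sub-model counts follow from countFreq in Example #4, assuming this __init__ belongs to the DependencyLM class shown there; the calls below are illustrative only, with made-up POS tags.

lm = DependencyLM()
lm.probHead.addNgramCount(["VB"])                          # root head tag (unigram)
lm.probLeft.addNgramCount(["___none", "VB___head", "NN"])  # first left dependent
lm.probLeft.addNgramCount(["NN", "VB___head", "DT"])       # later left dependent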
Example #4
class DependencyLM(object):

    def __init__(self, countkey=lambda tree: tree.data["pos"], smoothing="ml"):
        self.probHead = NgramLM(1)
        self.probLeft = NgramLM(3)
        self.probRight = NgramLM(3)
        self.countkey = countkey
        self.smoothing = smoothing

    def train(self, filename, modelFile, progress=10000):
        """
        filename: Filename of CONLL format
        progress: print dot(.) every #progress trees; default: 10000
        """
        cnt = 0
        print "Model training started"
        for depString in dph.readDependencyFile(filename):
            cnt += 1
            if cnt % progress == 0:
                sys.stdout.write(".")
            tree = dph.stringToDependencyTreeWeakRef(depString)
            if tree is None:
                continue
            self.countFreq(tree)
        if self.smoothing == "ml":
            # Apply maximum likelihood estimation to get probabilities from counts
            self.probHead.mlEstimate()
            self.probLeft.mlEstimate()
            self.probRight.mlEstimate()
        else:
            raise NotImplementedError("Currently only maximum likelihood is supported as a smoothing method (though ML is not a smoothing technique)")

        if cnt >= progress:
            print  # newline after the progress dots (skipped if no dots were printed)
        print "Model training has successfully finished!"
        print "Writing model infomation to %s" % (modelFile, )
        self.saveModelAsPlainText(modelFile)
        print "Finished writing model information to %s" % (modelFile, )

    def saveModelAsPlainText(self, filename):
        with open(filename, "w") as model:
           model.write("[probHead]\n")
           self.probHead.saveNgramInfo(fstream=model)
           model.write("[probLeft]\n")
           self.probLeft.saveNgramInfo(fstream=model)
           model.write("[probRight]\n")
           self.probRight.saveNgramInfo(fstream=model)
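    # The resulting plain-text model looks like the sketch below (format
    # inferred from readModelFromPlainText; tags and numbers are illustrative):
    #
    #   [probHead]
    #   VB<TAB>42<TAB>-1.386294
    #   [probLeft]
    #   ___none VB___head NN<TAB>7<TAB>-2.079442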

    def readModelFromPlainText(self, source):
        # source may be a filename or an already-open file object
        openedHere = False
        if isinstance(source, basestring):
            model = open(source, "r")
            openedHere = True
        else:
            model = source

        state = None
        for line in model:
            line = line.strip()
            if line in ["[probHead]", "[probLeft]", "[probRight]"]:
                state = line
                continue

            if line == "":
                continue

            ngram, count, logProb = line.split("\t")
            ngram = ngram.split(" ")
            count = int(count)
            if state == "[probHead]":
                self.probHead.addNgramCount(ngram, count)

            elif state == "[probLeft]":
                self.probLeft.addNgramCount(ngram, count)
            
            elif state == "[probRight]":
                self.probRight.addNgramCount(ngram, count)

        if self.smoothing == "ml":
            # Apply maximum likelihood estimation to get probabilities from counts
            self.probHead.mlEstimate()
            self.probLeft.mlEstimate()
            self.probRight.mlEstimate()

        else:
            raise NotImplementedError("Currently only maximum likelihood is supported as a smoothing method (though ML is not a smoothing technique)")

        if openedHere:  # close the stream only if it was opened in this method
            model.close()


    def saveModelAsProtocolBuffer(self, filename):
        with open(filename, "wb") as model:
            lmpb = depLM_pb2.depLM()
            self.probHead.writeMessage(lmpb.probHead.ngramEntries)
            self.probLeft.writeMessage(lmpb.probLeft.ngramEntries)
            self.probRight.writeMessage(lmpb.probRight.ngramEntries)
            model.write(lmpb.SerializeToString())
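    # The protobuf schema implied by these methods (a reconstruction, not the
    # project's actual .proto file):
    #
    #   message NgramEntry { repeated string ngram = 1; int32 count = 2; double prob = 3; }
    #   message NgramModel { repeated NgramEntry ngramEntries = 1; }
    #   message depLM { NgramModel probHead = 1; NgramModel probLeft = 2; NgramModel probRight = 3; }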
             
    def readModelFromProtocolBuffer(self, source):
        # source may be a filename or an already-open binary file object
        openedHere = False
        if isinstance(source, basestring):
            model = open(source, "rb")
            openedHere = True
        else:
            model = source

        lmpb = depLM_pb2.depLM()
        lmpb.ParseFromString(model.read())
        for ngramEntry in lmpb.probHead.ngramEntries:
            self.probHead.addNgramProb(ngramEntry.ngram, ngramEntry.prob)
            self.probHead.addNgramCount(ngramEntry.ngram, ngramEntry.count)
        for ngramEntry in lmpb.probLeft.ngramEntries:
            self.probLeft.addNgramProb(ngramEntry.ngram, ngramEntry.prob)
            self.probLeft.addNgramCount(ngramEntry.ngram, ngramEntry.count)
        for ngramEntry in lmpb.probRight.ngramEntries:
            self.probRight.addNgramProb(ngramEntry.ngram, ngramEntry.prob)
            self.probRight.addNgramCount(ngramEntry.ngram, ngramEntry.count)
        if openedHere:  # close the stream only if it was opened in this method
            model.close()

    def countFreq(self, node):
        # The root of the tree contributes a unigram count to the head model
        if node.parent is None:
            self.probHead.addNgramCount([self.countkey(node)])
        left, right = node.partitionChildren()

        # Left dependents: trigram (previous sibling, head, dependent);
        # "___none" marks the first dependent, "___head" marks the head token
        if len(left) > 0:
            self.probLeft.addNgramCount(["___none", self.countkey(left[0].parent) + "___head", self.countkey(left[0])])
        for index in xrange(1, len(left)):
            self.probLeft.addNgramCount([self.countkey(left[index - 1]), self.countkey(left[index].parent) + "___head", self.countkey(left[index])])

        # Right dependents are counted the same way in their own model
        if len(right) > 0:
            self.probRight.addNgramCount(["___none", self.countkey(right[0].parent) + "___head", self.countkey(right[0])])
        for index in xrange(1, len(right)):
            self.probRight.addNgramCount([self.countkey(right[index - 1]), self.countkey(right[index].parent) + "___head", self.countkey(right[index])])

        # Recurse into all children
        for child in node.children:
            self.countFreq(child)
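A hedged end-to-end sketch of using this class; the paths below are placeholders, and the original module is assumed to import sys, the dependency helpers dph, and the generated depLM_pb2 protobuf module.

# Hypothetical driver; "train.conll" and "model.txt" are placeholder paths
dlm = DependencyLM()
dlm.train("train.conll", "model.txt")  # count events, ML-estimate, save

# Reload the saved plain-text model into a fresh instance
dlm2 = DependencyLM()
dlm2.readModelFromPlainText("model.txt")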