Example #1
 def loadModel(self):
     """
     The loadModel method loads the unigram models through the superclass call and then reads the bigram models
     from words2.txt and igs2.txt.
     """
     super().loadModel()
     self.wordBiGramModel = NGram("words2.txt")
     self.igBiGramModel = NGram("igs2.txt")
Example #2
    def train(self, corpus: DisambiguationCorpus):
        """
        The train method first creates four new NGrams: wordUniGramModel, wordBiGramModel, igUniGramModel and
        igBiGramModel. It iterates over the sentences of the given corpus and takes each word as a DisambiguatedWord.
        Each word, together with its part of speech tag, is added to the wordUniGramModel, and its transition list is
        added to the igUniGramModel.

        If the sentence contains a next word, the current and next DisambiguatedWord are added to the wordBiGramModel
        with their part of speech tags, and to the igBiGramModel with their transition lists.

        At the end, the NGram probabilities of all four models (word and ig, unigram and bigram) are calculated with
        LaplaceSmoothing.

        PARAMETERS
        ----------
        corpus : DisambiguationCorpus
            DisambiguationCorpus to train.
        """
        words1 = [None]
        igs1 = [None]
        words2 = [None, None]
        igs2 = [None, None]
        self.wordUniGramModel = NGram(1)
        self.wordBiGramModel = NGram(2)
        self.igUniGramModel = NGram(1)
        self.igBiGramModel = NGram(2)
        for sentence in corpus.sentences:
            for j in range(sentence.wordCount()):
                word = sentence.getWord(j)
                if isinstance(word, DisambiguatedWord):
                    words1[0] = word.getParse().getWordWithPos()
                    self.wordUniGramModel.addNGram(words1)
                    igs1[0] = Word(word.getParse().getTransitionList())
                    self.igUniGramModel.addNGram(igs1)
                    if j + 1 < sentence.wordCount():
                        words2[0] = words1[0]
                        words2[1] = sentence.getWord(
                            j + 1).getParse().getWordWithPos()
                        self.wordBiGramModel.addNGram(words2)
                        igs2[0] = igs1[0]
                        igs2[1] = Word(
                            sentence.getWord(j +
                                             1).getParse().getTransitionList())
                        self.igBiGramModel.addNGram(igs2)
        self.wordUniGramModel.calculateNGramProbabilitiesSimple(
            LaplaceSmoothing())
        self.igUniGramModel.calculateNGramProbabilitiesSimple(
            LaplaceSmoothing())
        self.wordBiGramModel.calculateNGramProbabilitiesSimple(
            LaplaceSmoothing())
        self.igBiGramModel.calculateNGramProbabilitiesSimple(
            LaplaceSmoothing())
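For context, a minimal driver for this trainer might look like the sketch below. The class name HmmDisambiguation, the import paths, and the corpus file name are assumptions inferred from the snippets on this page, not confirmed by them.

    # Hedged sketch: module paths, the HmmDisambiguation class name and the
    # corpus file name are assumptions, shown for illustration only.
    from MorphologicalDisambiguation.DisambiguationCorpus import DisambiguationCorpus
    from MorphologicalDisambiguation.HmmDisambiguation import HmmDisambiguation

    corpus = DisambiguationCorpus("train.txt")   # pre-disambiguated training data (file name is illustrative)
    disambiguator = HmmDisambiguation()
    disambiguator.train(corpus)    # builds and smooths the four NGram models shown above
    disambiguator.saveModel()      # presumably persists them as words1/2.txt and igs1/2.txt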
Example #3
 def test_LoadMultiPart(self):
     self.simpleUniGram = NGram(1)
     self.simpleUniGram.initWithMultipleFile("simple1part1.txt",
                                             "simple1part2.txt")
     self.simpleBiGram = NGram(2)
     self.simpleBiGram.initWithMultipleFile("simple2part1.txt",
                                            "simple2part2.txt",
                                            "simple2part3.txt")
     self.simpleTriGram = NGram(3)
     self.simpleTriGram.initWithMultipleFile("simple3part1.txt",
                                             "simple3part2.txt",
                                             "simple3part3.txt",
                                             "simple3part4.txt")
     self.test_GetCountSimple()
     self.test_VocabularySizeSimple()
Example #4
 def test_SpellCheck(self):
     original = [Sentence("demokratik cumhuriyet en kıymetli varlığımızdır"),
             Sentence("bu tablodaki değerler zedelenmeyecektir"),
             Sentence("milliyet'in geleneksel yılın sporcusu anketi 43. yaşını doldurdu"),
             Sentence("demokrasinin icadı bu ayrımı bulandırdı"),
             Sentence("dışişleri müsteşarı Öymen'in 1997'nin ilk aylarında Bağdat'a gitmesi öngörülüyor"),
             Sentence("büyüdü , palazlandı , devleti ele geçirdi"),
             Sentence("her maskenin ciltte kalma süresi farklıdır"),
             Sentence("yılın son ayında 10 gazeteci gözaltına alındı"),
             Sentence("iki pilotun kullandığı uçakta bir hostes görev alıyor"),
             Sentence("son derece kısıtlı kelimeler çerçevesinde kendilerini uzun cümlelerle ifade edebiliyorlar")]
     modified = [Sentence("demokratik cumhüriyet en kımetli varlıgımızdır"),
             Sentence("bu tblodaki değerlğr zedelenmeyecüktir"),
             Sentence("milliyet'in geeneksel yılın spoşcusu ankşti 43. yeşını doldürdu"),
             Sentence("demokrasinin icşdı bu ayrmıı bulandürdı"),
             Sentence("dışişleri mütseşarı Öymen'in 1997'nin ilk aylğrında Bağdat'a gitmesi öngşrülüyor"),
             Sentence("büyüdü , palazandı , devltei ele geçridi"),
             Sentence("her makenin cültte kalma sürdsi farlkıdır"),
             Sentence("yılın sno ayında 10 gazteci gözlatına alündı"),
             Sentence("iki piotun kulçandığı uçkata üir hotes görçv alyıor"),
             Sentence("son deece kısütlı keilmeler çeçevesinde kendülerini uzuü cümllerle ifüde edbeiliyorlar")]
     fsm = FsmMorphologicalAnalyzer("../turkish_dictionary.txt", "../turkish_misspellings.txt",
                                    "../turkish_finite_state_machine.xml")
     nGram = NGram("../ngram.txt")
     nGram.calculateNGramProbabilitiesSimple(NoSmoothing())
     nGramSpellChecker = NGramSpellChecker(fsm, nGram)
     for i in range(len(modified)):
         self.assertEqual(original[i].toString(), nGramSpellChecker.spellCheck(modified[i]).toString())
Example #5
    def learnParameters(self, corpus: list, N: int):
        """
        Wrapper function to learn the interpolation parameters (lambda1, and additionally lambda2 for trigrams) in
        interpolated smoothing. The function first creates K NGrams from the train folds of the corpus and then
        optimizes the lambdas with respect to the test folds, depending on the given N.

        PARAMETERS
        ----------
        corpus : list
            Train corpus used to optimize lambda parameters
        N : int
            N in N-Gram.
        """
        if N <= 1:
            return
        K = 10
        nGrams = []
        kFoldCrossValidation = KFoldCrossValidation(corpus, K, 0)
        for i in range(K):
            nGrams.append(NGram(N, kFoldCrossValidation.getTrainFold(i)))
            for j in range(2, N + 1):
                nGrams[i].calculateNGramProbabilitiesSimpleLevel(
                    self.__simpleSmoothing, j)
            nGrams[i].calculateNGramProbabilitiesSimpleLevel(
                self.__simpleSmoothing, 1)
        if N == 2:
            self.__lambda1 = self.__learnBestLambda(nGrams,
                                                    kFoldCrossValidation, 0.1)
        elif N == 3:
            (self.__lambda1,
             self.__lambda2) = self.__learnBestLambdas(nGrams,
                                                       kFoldCrossValidation,
                                                       0.1, 0.1)
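As a usage sketch, this learner can be driven directly on a token-list corpus such as the one built in the setUp example further below. The no-argument InterpolatedSmoothing constructor (assumed to pick a default simple smoothing for the folds) and the trainCorpus variable are assumptions.

    # Hedged sketch: constructor behaviour and the corpus variable are assumptions;
    # learnParameters is the method shown above.
    smoothing = InterpolatedSmoothing()
    smoothing.learnParameters(trainCorpus, 3)  # 10-fold CV, fits lambda1 and lambda2 for a trigram model
    # With N == 2 only lambda1 is fitted; with N <= 1 the call returns immediately.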
Example #6
    def train(self, corpus: DisambiguationCorpus):
        """
        The train method iterates over the sentences of the given DisambiguationCorpus, taking each word together with
        the word that follows it. The current word, with its part of speech tag, is added to the word unigram model,
        and the pair of current and next word to the word bigram model. For the ig models, the last inflectional group
        of the current word is paired with every inflectional group of the next word in the ig bigram model, and each
        inflectional group of the next word is added to the ig unigram model.

        At the end, the NGram probabilities of all four models (word and ig, unigram and bigram) are calculated with
        LaplaceSmoothing.

        PARAMETERS
        ----------
        corpus : DisambiguationCorpus
            DisambiguationCorpus to train.
        """
        words1 = [None]
        igs1 = [None]
        words2 = [None, None]
        igs2 = [None, None]
        self.wordUniGramModel = NGram(1)
        self.igUniGramModel = NGram(1)
        self.wordBiGramModel = NGram(2)
        self.igBiGramModel = NGram(2)
        for sentence in corpus.sentences:
            for j in range(sentence.wordCount() - 1):
                word = sentence.getWord(j)
                nextWord = sentence.getWord(j + 1)
                words2[0] = word.getParse().getWordWithPos()
                words1[0] = words2[0]
                words2[1] = nextWord.getParse().getWordWithPos()
                self.wordUniGramModel.addNGram(words1)
                self.wordBiGramModel.addNGram(words2)
                for k in range(nextWord.getParse().size()):
                    igs2[0] = Word(
                        word.getParse().getLastInflectionalGroup().__str__())
                    igs2[1] = Word(
                        nextWord.getParse().getInflectionalGroup(k).__str__())
                    self.igBiGramModel.addNGram(igs2)
                    igs1[0] = igs2[1]
                    self.igUniGramModel.addNGram(igs1)
        self.wordUniGramModel.calculateNGramProbabilitiesSimple(
            LaplaceSmoothing())
        self.igUniGramModel.calculateNGramProbabilitiesSimple(
            LaplaceSmoothing())
        self.wordBiGramModel.calculateNGramProbabilitiesSimple(
            LaplaceSmoothing())
        self.igBiGramModel.calculateNGramProbabilitiesSimple(
            LaplaceSmoothing())
Example #7
 def test_Merge(self):
     self.simpleUniGram = NGram("simple1a.txt")
     self.simpleUniGram.merge(NGram("simple1b.txt"))
     self.assertEqual(18, self.simpleUniGram.vocabularySize())
     self.simpleBiGram = NGram("simple2a.txt")
     self.simpleBiGram.merge(NGram("simple2b.txt"))
     self.simpleBiGram.merge(NGram("simple2c.txt"))
     self.simpleBiGram.merge(NGram("simple2d.txt"))
     self.assertEqual(21, self.simpleBiGram.vocabularySize())
     self.simpleTriGram = NGram("simple3a.txt")
     self.simpleTriGram.merge(NGram("simple3b.txt"))
     self.simpleTriGram.merge(NGram("simple3c.txt"))
     self.assertEqual(20, self.simpleTriGram.vocabularySize())
 def test_Deasciify2(self):
     fsm = FsmMorphologicalAnalyzer("../turkish_dictionary.txt", "../turkish_misspellings.txt",
                                    "../turkish_finite_state_machine.xml")
     nGram = NGram("../ngram.txt")
     nGram.calculateNGramProbabilitiesSimple(NoSmoothing())
     nGramDeasciifier = NGramDeasciifier(fsm, nGram, False)
     self.assertEqual("noter hakkında", nGramDeasciifier.deasciify(Sentence("noter hakkinda")).__str__())
     self.assertEqual("sandık medrese", nGramDeasciifier.deasciify(Sentence("sandik medrese")).__str__())
     self.assertEqual("kuran'ı karşılıklı", nGramDeasciifier.deasciify(Sentence("kuran'ı karsilikli")).__str__())
 def test_Deasciify(self):
     fsm = FsmMorphologicalAnalyzer("../turkish_dictionary.txt", "../turkish_misspellings.txt",
                                    "../turkish_finite_state_machine.xml")
     nGram = NGram("../ngram.txt")
     nGram.calculateNGramProbabilitiesSimple(NoSmoothing())
     nGramDeasciifier = NGramDeasciifier(fsm, nGram, True)
     simpleAsciifier = SimpleAsciifier()
     corpus = Corpus("../corpus.txt")
     for i in range(corpus.sentenceCount()):
         sentence = corpus.getSentence(i)
         for j in range(1, sentence.wordCount()):
             if fsm.morphologicalAnalysis(sentence.getWord(j).getName()).size() > 0:
                 asciified = simpleAsciifier.asciifyWord(sentence.getWord(j))
                 if asciified != sentence.getWord(j).getName():
                     deasciified = nGramDeasciifier.deasciify(Sentence(sentence.getWord(j - 1).getName() + " " + sentence.getWord(j).getName()))
                     self.assertEqual(sentence.getWord(j).getName(), deasciified.getWord(1).getName())
Example #10
 def test_SpellCheckSurfaceForm(self):
     fsm = FsmMorphologicalAnalyzer("../turkish_dictionary.txt",
                                    "../turkish_misspellings.txt",
                                    "../turkish_finite_state_machine.xml")
     nGram = NGram("../ngram.txt")
     nGram.calculateNGramProbabilitiesSimple(NoSmoothing())
     nGramSpellChecker = NGramSpellChecker(fsm, nGram, False)
     self.assertEqual(
         "noter hakkında",
         nGramSpellChecker.spellCheck(Sentence("noter hakkınad")).__str__())
     self.assertEqual(
         "arçelik'in çamaşır",
         nGramSpellChecker.spellCheck(
             Sentence("arçelik'in çamşaır")).__str__())
     self.assertEqual(
         "ruhsat yanında",
         nGramSpellChecker.spellCheck(Sentence("ruhset yanında")).__str__())
Example #11
    def learnParameters(self, corpus: list, N: int):
        """
        Wrapper function to learn the parameter (delta) in additive smoothing. The function first creates K NGrams
        from the train folds of the corpus and then optimizes delta with respect to the test folds.

        PARAMETERS
        ----------
        corpus : list
            Train corpus used to optimize delta parameter
        N : int
            N in N-Gram.
        """
        K = 10
        nGrams = []
        kFoldCrossValidation = KFoldCrossValidation(corpus, K, 0)
        for i in range(K):
            nGrams.append(NGram(N, kFoldCrossValidation.getTrainFold(i)))
        self.__delta = self.__learnBestDelta(nGrams, kFoldCrossValidation, 0.1)
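The additive variant can be exercised the same way; the class name AdditiveSmoothing, its no-argument constructor, and the trainCorpus variable are assumptions here.

    # Hedged sketch: class name, constructor and corpus variable are assumptions.
    smoothing = AdditiveSmoothing()
    smoothing.learnParameters(trainCorpus, 2)  # 10-fold CV, fits the single delta parameter
    # The learned delta is then used when this smoothing assigns the N-gram probabilities.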
Example #12
 def setUp(self) -> None:
     self.simpleCorpus = [
         ["<s>", "ali", "topu", "at", "mehmet", "ayşeye", "gitti", "</s>"],
         ["<s>", "ali", "top", "at", "ayşe", "eve", "gitti", "</s>"],
         ["<s>", "ayşe", "kitabı", "ver", "</s>"],
         ["<s>", "ali", "topu", "mehmete", "at", "</s>"],
         ["<s>", "ali", "topu", "at", "mehmet", "ayşeyle", "gitti", "</s>"]
     ]
     self.simpleUniGram = NGram(1, self.simpleCorpus)
     self.simpleBiGram = NGram(2, self.simpleCorpus)
     self.simpleTriGram = NGram(3, self.simpleCorpus)
     self.trainCorpus = self.readCorpus("../train.txt")
     self.complexUniGram = NGram(1, self.trainCorpus)
     self.complexBiGram = NGram(2, self.trainCorpus)
     self.complexTriGram = NGram(3, self.trainCorpus)
     self.testCorpus = self.readCorpus("../test.txt")
     self.validationCorpus = self.readCorpus("../validation.txt")
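Given simpleCorpus above, the raw counts the three models should report can be spelled out as a further test. The getCount(list) accessor is an assumption based on the test_GetCountSimple call in the multi-part loading example; the expected values follow directly from simpleCorpus.

 def test_CountsFromSimpleCorpus(self):
     # Hypothetical check; getCount taking a list of tokens is an assumption.
     # "ali" appears in 4 of the 5 sentences of simpleCorpus.
     self.assertEqual(4, self.simpleUniGram.getCount(["ali"]))
     # "ali topu" appears 3 times, "ali topu at" twice.
     self.assertEqual(3, self.simpleBiGram.getCount(["ali", "topu"]))
     self.assertEqual(2, self.simpleTriGram.getCount(["ali", "topu", "at"]))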
Example #13
 def test_VocabularySizeComplex(self):
     self.assertEqual(57625, self.complexUniGram.vocabularySize())
     self.complexUniGram = NGram(1, self.testCorpus)
     self.assertEqual(55485, self.complexUniGram.vocabularySize())
     self.complexUniGram = NGram(1, self.validationCorpus)
     self.assertEqual(35663, self.complexUniGram.vocabularySize())
 def loadModel(self):
     """
     The loadModel method reads the objects stored in words1.txt and igs1.txt into the wordUniGramModel and
     igUniGramModel.
     """
     self.wordUniGramModel = NGram("words1.txt")
     self.igUniGramModel = NGram("igs1.txt")