Exemplo n.º 1
0
 def loadModel(self):
     """
     Restore the bigram models from their text dumps, after the superclass
     has restored the unigram models.
     """
     # Unigram models (words1.txt / igs1.txt) are loaded by the parent class.
     super().loadModel()
     # Bigram models are persisted as plain-text NGram dumps.
     self.igBiGramModel = NGram("igs2.txt")
     self.wordBiGramModel = NGram("words2.txt")
Exemplo n.º 2
0
 def test_SpellCheck(self):
     """Each deliberately misspelled sentence must be corrected back to its original wording."""
     # (expected, misspelled) pairs.
     cases = [("demokratik cumhuriyet en kıymetli varlığımızdır",
               "demokratik cumhüriyet en kımetli varlıgımızdır"),
              ("bu tablodaki değerler zedelenmeyecektir",
               "bu tblodaki değerlğr zedelenmeyecüktir"),
              ("milliyet'in geleneksel yılın sporcusu anketi 43. yaşını doldurdu",
               "milliyet'in geeneksel yılın spoşcusu ankşti 43. yeşını doldürdu"),
              ("demokrasinin icadı bu ayrımı bulandırdı",
               "demokrasinin icşdı bu ayrmıı bulandürdı"),
              ("dışişleri müsteşarı Öymen'in 1997'nin ilk aylarında Bağdat'a gitmesi öngörülüyor",
               "dışişleri mütseşarı Öymen'in 1997'nin ilk aylğrında Bağdat'a gitmesi öngşrülüyor"),
              ("büyüdü , palazlandı , devleti ele geçirdi",
               "büyüdü , palazandı , devltei ele geçridi"),
              ("her maskenin ciltte kalma süresi farklıdır",
               "her makenin cültte kalma sürdsi farlkıdır"),
              ("yılın son ayında 10 gazeteci gözaltına alındı",
               "yılın sno ayında 10 gazteci gözlatına alündı"),
              ("iki pilotun kullandığı uçakta bir hostes görev alıyor",
               "iki piotun kulçandığı uçkata üir hotes görçv alyıor"),
              ("son derece kısıtlı kelimeler çerçevesinde kendilerini uzun cümlelerle ifade edebiliyorlar",
               "son deece kısütlı keilmeler çeçevesinde kendülerini uzuü cümllerle ifüde edbeiliyorlar")]
     fsm = FsmMorphologicalAnalyzer("../turkish_dictionary.txt", "../turkish_misspellings.txt",
                                    "../turkish_finite_state_machine.xml")
     nGram = NGram("../ngram.txt")
     nGram.calculateNGramProbabilitiesSimple(NoSmoothing())
     spellChecker = NGramSpellChecker(fsm, nGram)
     for expected, noisy in cases:
         self.assertEqual(Sentence(expected).toString(),
                          spellChecker.spellCheck(Sentence(noisy)).toString())
 def test_Deasciify2(self):
     """Deasciify short phrases (root-word check disabled) and compare with the expected Turkish text."""
     fsm = FsmMorphologicalAnalyzer("../turkish_dictionary.txt", "../turkish_misspellings.txt",
                                    "../turkish_finite_state_machine.xml")
     nGram = NGram("../ngram.txt")
     nGram.calculateNGramProbabilitiesSimple(NoSmoothing())
     deasciifier = NGramDeasciifier(fsm, nGram, False)
     # (ascii input, expected deasciified output) pairs.
     for asciiText, expected in [("noter hakkinda", "noter hakkında"),
                                 ("sandik medrese", "sandık medrese"),
                                 ("kuran'ı karsilikli", "kuran'ı karşılıklı")]:
         self.assertEqual(expected, deasciifier.deasciify(Sentence(asciiText)).__str__())
Exemplo n.º 4
0
    def setProbabilities(self, nGram: NGram, level: int):
        """
        Wrapper function to set the N-gram probabilities with laplace smoothing.

        PARAMETERS
        ----------
        nGram : NGram
            N-Gram for which the probabilities will be set.
        level : int
            Level for which N-Gram probabilities will be set. If level = 1, N-Gram is treated as UniGram,
            if level = 2, N-Gram is treated as Bigram, etc.
        """
        # Laplace smoothing is additive smoothing with the fixed pseudo-count self.__delta.
        nGram.setProbabilityWithPseudoCount(self.__delta, level)
Exemplo n.º 5
0
    def setProbabilities(self, nGram: NGram, level: int):
        """
        Apply additive (add-delta) smoothing to one level of the given N-gram.

        PARAMETERS
        ----------
        nGram : NGram
            N-Gram for which the probabilities will be set.
        level : int
            Level whose probabilities are set: 1 treats the model as a unigram,
            2 as a bigram, and so on.
        """
        # Additive smoothing adds the learned pseudo-count delta to every raw count.
        nGram.setProbabilityWithPseudoCount(self.__delta, level)
    def setProbabilities(self, nGram: NGram, level: int):
        """
        Wrapper function to set the N-gram probabilities with no smoothing, after replacing words that are
        not found in the dictionary (presumably with an unknown-word token -- see NGram.replaceUnknownWords).

        PARAMETERS
        ----------
        nGram : NGram
            N-Gram for which the probabilities will be set.
        level : int
            Level for which N-Gram probabilities will be set. Probabilities for different levels of the N-gram can be set
            with this function. If level = 1, N-Gram is treated as UniGram, if level = 2, N-Gram is treated as Bigram, etc.
        """
        # Normalize out-of-dictionary words before delegating to the plain no-smoothing logic.
        nGram.replaceUnknownWords(self.__dictionary)
        super().setProbabilities(nGram, level)
Exemplo n.º 7
0
    def learnParameters(self, corpus: list, N: int):
        """
        Learn the interpolation weights (lambda1, and additionally lambda2 when N == 3) with K-fold cross
        validation: build one N-gram per train fold, smooth every level with the simple smoother, then
        optimize the lambdas against the test folds. Does nothing when N <= 1.

        PARAMETERS
        ----------
        corpus : list
            Train corpus used to optimize lambda parameters
        N : int
            N in N-Gram.
        """
        if N <= 1:
            return
        foldCount = 10
        foldModels = []
        crossValidation = KFoldCrossValidation(corpus, foldCount, 0)
        for fold in range(foldCount):
            foldModels.append(NGram(N, crossValidation.getTrainFold(fold)))
            # Smooth the higher levels first, then the unigram level.
            for level in range(2, N + 1):
                foldModels[fold].calculateNGramProbabilitiesSimpleLevel(self.__simpleSmoothing, level)
            foldModels[fold].calculateNGramProbabilitiesSimpleLevel(self.__simpleSmoothing, 1)
        if N == 2:
            self.__lambda1 = self.__learnBestLambda(foldModels, crossValidation, 0.1)
        elif N == 3:
            self.__lambda1, self.__lambda2 = self.__learnBestLambdas(foldModels, crossValidation, 0.1, 0.1)
 def test_Deasciify(self):
     """For every analyzable corpus word whose asciified form differs from the original,
     deasciifying the two-word phrase (previous word + original word) must keep the word intact."""
     fsm = FsmMorphologicalAnalyzer("../turkish_dictionary.txt", "../turkish_misspellings.txt",
                                    "../turkish_finite_state_machine.xml")
     nGram = NGram("../ngram.txt")
     nGram.calculateNGramProbabilitiesSimple(NoSmoothing())
     deasciifier = NGramDeasciifier(fsm, nGram, True)
     asciifier = SimpleAsciifier()
     corpus = Corpus("../corpus.txt")
     for sentenceIndex in range(corpus.sentenceCount()):
         sentence = corpus.getSentence(sentenceIndex)
         for wordIndex in range(1, sentence.wordCount()):
             currentWord = sentence.getWord(wordIndex)
             # Skip words the analyzer cannot parse.
             if fsm.morphologicalAnalysis(currentWord.getName()).size() == 0:
                 continue
             # Skip words that asciification leaves unchanged.
             if asciifier.asciifyWord(currentWord) == currentWord.getName():
                 continue
             phrase = sentence.getWord(wordIndex - 1).getName() + " " + currentWord.getName()
             deasciified = deasciifier.deasciify(Sentence(phrase))
             self.assertEqual(currentWord.getName(), deasciified.getWord(1).getName())
Exemplo n.º 9
0
 def test_SpellCheckSurfaceForm(self):
     """Spell-check short phrases with root-n-gram mode disabled (surface forms)."""
     fsm = FsmMorphologicalAnalyzer("../turkish_dictionary.txt",
                                    "../turkish_misspellings.txt",
                                    "../turkish_finite_state_machine.xml")
     nGram = NGram("../ngram.txt")
     nGram.calculateNGramProbabilitiesSimple(NoSmoothing())
     spellChecker = NGramSpellChecker(fsm, nGram, False)
     # (misspelled input, expected correction) pairs.
     for misspelled, corrected in [("noter hakkınad", "noter hakkında"),
                                   ("arçelik'in çamşaır", "arçelik'in çamaşır"),
                                   ("ruhset yanında", "ruhsat yanında")]:
         self.assertEqual(corrected,
                          spellChecker.spellCheck(Sentence(misspelled)).__str__())
Exemplo n.º 10
0
    def train(self, corpus: DisambiguationCorpus):
        """
        Creates and trains wordUniGramModel, wordBiGramModel, igUniGramModel, and igBiGramModel. Each
        DisambiguatedWord in the corpus contributes its word-with-POS to the word unigram model and its
        transition list to the ig unigram model; each adjacent pair contributes to the corresponding
        bigram models. At the end, all four models are smoothed with LaplaceSmoothing.

        PARAMETERS
        ----------
        corpus : DisambiguationCorpus
            DisambiguationCorpus to train.
        """
        # Reusable scratch lists for 1-gram and 2-gram entries.
        # NOTE(review): these lists are mutated and re-passed every iteration,
        # which assumes NGram.addNGram copies its argument -- TODO confirm.
        words1 = [None]
        igs1 = [None]
        words2 = [None, None]
        igs2 = [None, None]
        self.wordUniGramModel = NGram(1)
        self.wordBiGramModel = NGram(2)
        self.igUniGramModel = NGram(1)
        self.igBiGramModel = NGram(2)
        for sentence in corpus.sentences:
            for j in range(sentence.wordCount()):
                word = sentence.getWord(j)
                # Only disambiguated words carry a parse; other words are skipped.
                if isinstance(word, DisambiguatedWord):
                    words1[0] = word.getParse().getWordWithPos()
                    self.wordUniGramModel.addNGram(words1)
                    igs1[0] = Word(word.getParse().getTransitionList())
                    self.igUniGramModel.addNGram(igs1)
                    # Pair with the following word for the bigram models.
                    if j + 1 < sentence.wordCount():
                        words2[0] = words1[0]
                        words2[1] = sentence.getWord(
                            j + 1).getParse().getWordWithPos()
                        self.wordBiGramModel.addNGram(words2)
                        igs2[0] = igs1[0]
                        igs2[1] = Word(
                            sentence.getWord(j +
                                             1).getParse().getTransitionList())
                        self.igBiGramModel.addNGram(igs2)
        # Smooth all four models with Laplace smoothing.
        self.wordUniGramModel.calculateNGramProbabilitiesSimple(
            LaplaceSmoothing())
        self.igUniGramModel.calculateNGramProbabilitiesSimple(
            LaplaceSmoothing())
        self.wordBiGramModel.calculateNGramProbabilitiesSimple(
            LaplaceSmoothing())
        self.igBiGramModel.calculateNGramProbabilitiesSimple(
            LaplaceSmoothing())
Exemplo n.º 11
0
    def setProbabilities(self, nGram: NGram, level: int):
        """
        Set N-gram probabilities at the given level with Good-Turing smoothing.
        N[1] / sum_{i=1}^infinity N_i is reserved as the out-of-vocabulary
        probability.

        PARAMETERS
        ----------
        nGram : NGram
            N-Gram for which the probabilities will be set.
        level : int
            Level whose probabilities are set: 1 treats the model as a unigram,
            2 as a bigram, and so on.
        """
        countsOfCounts = nGram.calculateCountsOfCounts(level)
        # Smooth the count-of-count curve with the linear-regression helper.
        adjustedCounts = self.__linearRegressionOnCountsOfCounts(countsOfCounts)
        # Total token mass: sum over r of r * N_r.
        totalCount = 0.0
        for r in range(1, len(countsOfCounts)):
            totalCount = totalCount + countsOfCounts[r] * r
        nGram.setAdjustedProbability(adjustedCounts, level,
                                     adjustedCounts[1] / totalCount)
Exemplo n.º 12
0
    def train(self, corpus: DisambiguationCorpus):
        """
        The train method gets sentences from given DisambiguationCorpus and both word and the next word of that sentence
        at each iteration. Then, adds these words together with their part of speech tags to word unigram and bigram
        models. It also adds the last inflectional group of word to the ig unigram and bigram models.

        At the end, it calculates the NGram probabilities of all four models (word and ig, unigram and bigram) by
        using LaplaceSmoothing.

        PARAMETERS
        ----------
        corpus : DisambiguationCorpus
            DisambiguationCorpus to train.
        """
        # Reusable scratch lists for 1-gram and 2-gram entries.
        # NOTE(review): these lists are mutated and re-passed every iteration,
        # which assumes NGram.addNGram copies its argument -- TODO confirm.
        words1 = [None]
        igs1 = [None]
        words2 = [None, None]
        igs2 = [None, None]
        self.wordUniGramModel = NGram(1)
        self.igUniGramModel = NGram(1)
        self.wordBiGramModel = NGram(2)
        self.igBiGramModel = NGram(2)
        for sentence in corpus.sentences:
            for j in range(sentence.wordCount() - 1):
                word = sentence.getWord(j)
                nextWord = sentence.getWord(j + 1)
                words2[0] = word.getParse().getWordWithPos()
                words1[0] = words2[0]
                words2[1] = nextWord.getParse().getWordWithPos()
                self.wordUniGramModel.addNGram(words1)
                self.wordBiGramModel.addNGram(words2)
                # Pair the last ig of the current word with every ig of the next word.
                for k in range(nextWord.getParse().size()):
                    igs2[0] = Word(
                        word.getParse().getLastInflectionalGroup().__str__())
                    igs2[1] = Word(
                        nextWord.getParse().getInflectionalGroup(k).__str__())
                    self.igBiGramModel.addNGram(igs2)
                    igs1[0] = igs2[1]
                    self.igUniGramModel.addNGram(igs1)
        # Smooth all four models with Laplace smoothing.
        self.wordUniGramModel.calculateNGramProbabilitiesSimple(
            LaplaceSmoothing())
        self.igUniGramModel.calculateNGramProbabilitiesSimple(
            LaplaceSmoothing())
        self.wordBiGramModel.calculateNGramProbabilitiesSimple(
            LaplaceSmoothing())
        self.igBiGramModel.calculateNGramProbabilitiesSimple(
            LaplaceSmoothing())
Exemplo n.º 13
0
    def learnParameters(self, corpus: list, N: int):
        """
        Learn the additive-smoothing parameter (delta) with K-fold cross validation:
        build one N-gram per train fold, then optimize delta against the test folds.

        PARAMETERS
        ----------
        corpus : list
            Train corpus used to optimize delta parameter
        N : int
            N in N-Gram.
        """
        foldCount = 10
        crossValidation = KFoldCrossValidation(corpus, foldCount, 0)
        # One model per train fold.
        foldModels = [NGram(N, crossValidation.getTrainFold(fold))
                      for fold in range(foldCount)]
        self.__delta = self.__learnBestDelta(foldModels, crossValidation, 0.1)
Exemplo n.º 14
0
 def test_LoadMultiPart(self):
     """Initialize the simple n-gram models from multi-part text dumps, then re-run the simple checks."""
     self.simpleUniGram = NGram(1)
     self.simpleUniGram.initWithMultipleFile("simple1part1.txt", "simple1part2.txt")
     self.simpleBiGram = NGram(2)
     self.simpleBiGram.initWithMultipleFile("simple2part1.txt", "simple2part2.txt", "simple2part3.txt")
     self.simpleTriGram = NGram(3)
     self.simpleTriGram.initWithMultipleFile("simple3part1.txt", "simple3part2.txt",
                                             "simple3part3.txt", "simple3part4.txt")
     # The reloaded models must behave exactly like the ones built in setUp.
     self.test_GetCountSimple()
     self.test_VocabularySizeSimple()
Exemplo n.º 15
0
 def setUp(self) -> None:
     """Build the toy corpus and its models, plus models and corpora read from disk."""
     self.simpleCorpus = [
         ["<s>", "ali", "topu", "at", "mehmet", "ayşeye", "gitti", "</s>"],
         ["<s>", "ali", "top", "at", "ayşe", "eve", "gitti", "</s>"],
         ["<s>", "ayşe", "kitabı", "ver", "</s>"],
         ["<s>", "ali", "topu", "mehmete", "at", "</s>"],
         ["<s>", "ali", "topu", "at", "mehmet", "ayşeyle", "gitti", "</s>"]
     ]
     # Order-1..3 models over the toy corpus.
     self.simpleUniGram = NGram(1, self.simpleCorpus)
     self.simpleBiGram = NGram(2, self.simpleCorpus)
     self.simpleTriGram = NGram(3, self.simpleCorpus)
     # Larger models over the training corpus read from file.
     self.trainCorpus = self.readCorpus("../train.txt")
     self.complexUniGram = NGram(1, self.trainCorpus)
     self.complexBiGram = NGram(2, self.trainCorpus)
     self.complexTriGram = NGram(3, self.trainCorpus)
     # Held-out corpora used by individual tests.
     self.testCorpus = self.readCorpus("../test.txt")
     self.validationCorpus = self.readCorpus("../validation.txt")
 def loadModel(self):
     """
     Restore the unigram models from their text dumps (words1.txt and igs1.txt).
     """
     self.igUniGramModel = NGram("igs1.txt")
     self.wordUniGramModel = NGram("words1.txt")
Exemplo n.º 17
0
class HmmDisambiguation(NaiveDisambiguation):
    # Bigram models layered on top of the unigram models owned by NaiveDisambiguation.
    wordBiGramModel: NGram
    igBiGramModel: NGram

    def train(self, corpus: DisambiguationCorpus):
        """
        The train method gets sentences from given DisambiguationCorpus and both word and the next word of that sentence
        at each iteration. Then, adds these words together with their part of speech tags to word unigram and bigram
        models. It also adds the last inflectional group of word to the ig unigram and bigram models.

        At the end, it calculates the NGram probabilities of all four models (word and ig, unigram and bigram) by
        using LaplaceSmoothing.

        PARAMETERS
        ----------
        corpus : DisambiguationCorpus
            DisambiguationCorpus to train.
        """
        # Reusable scratch lists for 1-gram and 2-gram entries.
        # NOTE(review): these lists are mutated and re-passed every iteration,
        # which assumes NGram.addNGram copies its argument -- TODO confirm.
        words1 = [None]
        igs1 = [None]
        words2 = [None, None]
        igs2 = [None, None]
        self.wordUniGramModel = NGram(1)
        self.igUniGramModel = NGram(1)
        self.wordBiGramModel = NGram(2)
        self.igBiGramModel = NGram(2)
        for sentence in corpus.sentences:
            for j in range(sentence.wordCount() - 1):
                word = sentence.getWord(j)
                nextWord = sentence.getWord(j + 1)
                words2[0] = word.getParse().getWordWithPos()
                words1[0] = words2[0]
                words2[1] = nextWord.getParse().getWordWithPos()
                self.wordUniGramModel.addNGram(words1)
                self.wordBiGramModel.addNGram(words2)
                # Pair the last ig of the current word with every ig of the next word.
                for k in range(nextWord.getParse().size()):
                    igs2[0] = Word(
                        word.getParse().getLastInflectionalGroup().__str__())
                    igs2[1] = Word(
                        nextWord.getParse().getInflectionalGroup(k).__str__())
                    self.igBiGramModel.addNGram(igs2)
                    igs1[0] = igs2[1]
                    self.igUniGramModel.addNGram(igs1)
        # Smooth all four models with Laplace smoothing.
        self.wordUniGramModel.calculateNGramProbabilitiesSimple(
            LaplaceSmoothing())
        self.igUniGramModel.calculateNGramProbabilitiesSimple(
            LaplaceSmoothing())
        self.wordBiGramModel.calculateNGramProbabilitiesSimple(
            LaplaceSmoothing())
        self.igBiGramModel.calculateNGramProbabilitiesSimple(
            LaplaceSmoothing())

    def disambiguate(self, fsmParses: list) -> list:
        """
        Viterbi-style search for the most probable sequence of parses.

        Scores for the first word's parses come from the word and ig unigram
        models; each later position is scored against every parse of the
        previous word with the word and ig bigram models, keeping the best
        predecessor per parse. The best final parse is then traced back through
        the stored predecessor indices.

        PARAMETERS
        ----------
        fsmParses : list
            FsmParseList to disambiguate.

        RETURNS
        -------
        list
            List of FsmParses, or None when the input is empty, any word has no
            parse, or no best path can be recovered.
        """
        if len(fsmParses) == 0:
            return None
        for i in range(len(fsmParses)):
            if fsmParses[i].size() == 0:
                return None
        correctFsmParses = []
        # probabilities[i][j]: best log-probability of reaching parse j of word i.
        # best[i][j]: predecessor parse index that achieves that probability.
        probabilities = [[0.0 for _ in range(fsmParses[i].size())]
                         for i in range(len(fsmParses))]
        best = [[0 for _ in range(fsmParses[i].size())]
                for i in range(len(fsmParses))]
        # Initialization: unigram score (word * igs) for each parse of the first word.
        for i in range(fsmParses[0].size()):
            currentParse = fsmParses[0].getFsmParse(i)
            if isinstance(currentParse, FsmParse):
                w1 = currentParse.getWordWithPos()
                probability = self.wordUniGramModel.getProbability(w1)
                for j in range(currentParse.size()):
                    ig1 = Word(currentParse.getInflectionalGroup(j).__str__())
                    probability *= self.igUniGramModel.getProbability(ig1)
                probabilities[0][i] = math.log(probability)
        # Forward pass: for every parse of every later word, keep the best predecessor.
        for i in range(1, len(fsmParses)):
            for j in range(fsmParses[i].size()):
                bestProbability = -10000
                bestIndex = -1
                currentParse = fsmParses[i].getFsmParse(j)
                if isinstance(currentParse, FsmParse):
                    for k in range(fsmParses[i - 1].size()):
                        previousParse = fsmParses[i - 1].getFsmParse(k)
                        w1 = previousParse.getWordWithPos()
                        w2 = currentParse.getWordWithPos()
                        probability = probabilities[i - 1][k] + math.log(
                            self.wordBiGramModel.getProbability(w1, w2))
                        for t in range(fsmParses[i].getFsmParse(j).size()):
                            ig1 = Word(previousParse.lastInflectionalGroup().
                                       __str__())
                            ig2 = Word(
                                currentParse.getInflectionalGroup(t).__str__())
                            probability += math.log(
                                self.igBiGramModel.getProbability(ig1, ig2))
                        if probability > bestProbability:
                            bestIndex = k
                            bestProbability = probability
                probabilities[i][j] = bestProbability
                best[i][j] = bestIndex
        # Pick the best-scoring parse of the last word.
        bestProbability = -10000
        bestIndex = -1
        for i in range(fsmParses[len(fsmParses) - 1].size()):
            if probabilities[len(fsmParses) - 1][i] > bestProbability:
                bestProbability = probabilities[len(fsmParses) - 1][i]
                bestIndex = i
        if bestIndex == -1:
            return None
        correctFsmParses.append(fsmParses[len(fsmParses) -
                                          1].getFsmParse(bestIndex))
        # Backtrack through the stored predecessor indices.
        for i in range(len(fsmParses) - 2, -1, -1):
            bestIndex = best[i + 1][bestIndex]
            if bestIndex == -1:
                return None
            correctFsmParses.insert(0, fsmParses[i].getFsmParse(bestIndex))
        return correctFsmParses

    def saveModel(self):
        """
        Save the bigram models as text files, after the superclass saves the unigram models.
        """
        super().saveModel()
        self.wordBiGramModel.saveAsText("words2.txt")
        self.igBiGramModel.saveAsText("igs2.txt")

    def loadModel(self):
        """
        Load the bigram models from text files, after the superclass loads the unigram models.
        """
        super().loadModel()
        self.wordBiGramModel = NGram("words2.txt")
        self.igBiGramModel = NGram("igs2.txt")
Exemplo n.º 18
0
class MyTestCase(CorpusTest, unittest.TestCase):
    """Tests for NGram counts, vocabulary sizes and text serialization.

    NOTE(review): the previous version passed a spurious third argument (0.0)
    to many assertEqual calls. assertEqual's third parameter is the failure
    *message*, not a tolerance, so those arguments did nothing useful and have
    been removed; the compared values are exact integers anyway.
    """

    simpleUniGram: NGram
    simpleBiGram: NGram
    simpleTriGram: NGram
    complexUniGram: NGram
    complexBiGram: NGram
    complexTriGram: NGram
    simpleCorpus: list
    trainCorpus: list
    testCorpus: list
    validationCorpus: list

    def setUp(self) -> None:
        """Build the toy corpus and its models, plus models and corpora read from disk."""
        self.simpleCorpus = [
            ["<s>", "ali", "topu", "at", "mehmet", "ayşeye", "gitti", "</s>"],
            ["<s>", "ali", "top", "at", "ayşe", "eve", "gitti", "</s>"],
            ["<s>", "ayşe", "kitabı", "ver", "</s>"],
            ["<s>", "ali", "topu", "mehmete", "at", "</s>"],
            ["<s>", "ali", "topu", "at", "mehmet", "ayşeyle", "gitti", "</s>"]
        ]
        self.simpleUniGram = NGram(1, self.simpleCorpus)
        self.simpleBiGram = NGram(2, self.simpleCorpus)
        self.simpleTriGram = NGram(3, self.simpleCorpus)
        self.trainCorpus = self.readCorpus("../train.txt")
        self.complexUniGram = NGram(1, self.trainCorpus)
        self.complexBiGram = NGram(2, self.trainCorpus)
        self.complexTriGram = NGram(3, self.trainCorpus)
        self.testCorpus = self.readCorpus("../test.txt")
        self.validationCorpus = self.readCorpus("../validation.txt")

    def test_GetCountSimple(self):
        """Exact n-gram counts over the hand-written toy corpus."""
        self.assertEqual(5, self.simpleUniGram.getCount(["<s>"]))
        self.assertEqual(0, self.simpleUniGram.getCount(["mahmut"]))
        self.assertEqual(1, self.simpleUniGram.getCount(["kitabı"]))
        self.assertEqual(4, self.simpleBiGram.getCount(["<s>", "ali"]))
        self.assertEqual(0, self.simpleBiGram.getCount(["ayşe", "ali"]))
        self.assertEqual(0, self.simpleBiGram.getCount(["mahmut", "ali"]))
        self.assertEqual(2, self.simpleBiGram.getCount(["at", "mehmet"]))
        self.assertEqual(1, self.simpleTriGram.getCount(["<s>", "ali", "top"]))
        self.assertEqual(0, self.simpleTriGram.getCount(["ayşe", "kitabı", "at"]))
        self.assertEqual(0, self.simpleTriGram.getCount(["ayşe", "topu", "at"]))
        self.assertEqual(0, self.simpleTriGram.getCount(["mahmut", "evde", "kal"]))
        self.assertEqual(2, self.simpleTriGram.getCount(["ali", "topu", "at"]))

    def test_GetCountComplex(self):
        """Exact n-gram counts over the training corpus read from file."""
        self.assertEqual(20000, self.complexUniGram.getCount(["<s>"]))
        self.assertEqual(50, self.complexUniGram.getCount(["atatürk"]))
        self.assertEqual(11, self.complexBiGram.getCount(["<s>", "mustafa"]))
        self.assertEqual(3, self.complexBiGram.getCount(["mustafa", "kemal"]))
        self.assertEqual(1, self.complexTriGram.getCount(["<s>", "mustafa", "kemal"]))
        self.assertEqual(1, self.complexTriGram.getCount(["mustafa", "kemal", "atatürk"]))

    def test_VocabularySizeSimple(self):
        """Vocabulary size of the toy unigram model."""
        self.assertEqual(15, self.simpleUniGram.vocabularySize())

    def test_VocabularySizeComplex(self):
        """Vocabulary sizes of unigram models built from the train, test and validation corpora."""
        self.assertEqual(57625, self.complexUniGram.vocabularySize())
        self.complexUniGram = NGram(1, self.testCorpus)
        self.assertEqual(55485, self.complexUniGram.vocabularySize())
        self.complexUniGram = NGram(1, self.validationCorpus)
        self.assertEqual(35663, self.complexUniGram.vocabularySize())

    def test_SaveAsText(self):
        """Serialize the toy models to text dumps."""
        self.simpleUniGram.saveAsText("simple1.txt")
        self.simpleBiGram.saveAsText("simple2.txt")
        self.simpleTriGram.saveAsText("simple3.txt")
Exemplo n.º 19
0
 def test_VocabularySizeComplex(self):
     """Vocabulary sizes of unigram models built from the train, test and validation corpora.

     NOTE(review): the previous version passed 0.0 as a third argument to
     assertEqual; that parameter is the failure *message*, not a tolerance,
     so the spurious arguments were removed.
     """
     self.assertEqual(57625, self.complexUniGram.vocabularySize())
     self.complexUniGram = NGram(1, self.testCorpus)
     self.assertEqual(55485, self.complexUniGram.vocabularySize())
     self.complexUniGram = NGram(1, self.validationCorpus)
     self.assertEqual(35663, self.complexUniGram.vocabularySize())
Exemplo n.º 20
0
class RootFirstDisambiguation(NaiveDisambiguation):
    """
    Morphological disambiguator that first selects the most probable root
    word (using word and ig unigram probabilities) and then, among the
    parses sharing that root, the parse with the best inflectional-group
    (transition list) probability.
    """

    # Bigram model over words combined with their part-of-speech tags.
    wordBiGramModel: NGram
    # Bigram model over inflectional-group (transition list) tokens.
    igBiGramModel: NGram

    def train(self, corpus: DisambiguationCorpus):
        """
        The train method initially creates new NGrams; wordUniGramModel, wordBiGramModel, igUniGramModel, and
        igBiGramModel. It gets the sentences from given corpus and gets each word as a DisambiguatedWord. Then, adds the
        word together with its part of speech tags to the wordUniGramModel. It also gets the transition list of that
        word and adds it to the igUniGramModel.

        If there exists a next word in the sentence, it adds the current and next {@link DisambiguatedWord} to the
        wordBiGramModel with their part of speech tags. It also adds them to the igBiGramModel with their transition
        lists.

        At the end, it calculates the NGram probabilities of all four models (word and ig, unigram and bigram) by
        using LaplaceSmoothing.

        PARAMETERS
        ----------
        corpus : DisambiguationCorpus
            DisambiguationCorpus to train.
        """
        # Reusable fixed-size buffers handed to addNGram; assumes addNGram
        # copies its input rather than keeping the list reference — TODO confirm.
        words1 = [None]
        igs1 = [None]
        words2 = [None, None]
        igs2 = [None, None]
        self.wordUniGramModel = NGram(1)
        self.wordBiGramModel = NGram(2)
        self.igUniGramModel = NGram(1)
        self.igBiGramModel = NGram(2)
        for sentence in corpus.sentences:
            for j in range(sentence.wordCount()):
                word = sentence.getWord(j)
                if isinstance(word, DisambiguatedWord):
                    words1[0] = word.getParse().getWordWithPos()
                    self.wordUniGramModel.addNGram(words1)
                    igs1[0] = Word(word.getParse().getTransitionList())
                    self.igUniGramModel.addNGram(igs1)
                    # Bigrams pair the current word with its successor.
                    # NOTE(review): getWord(j + 1) is not checked to be a
                    # DisambiguatedWord before getParse() — confirm the corpus
                    # guarantees this.
                    if j + 1 < sentence.wordCount():
                        words2[0] = words1[0]
                        words2[1] = sentence.getWord(
                            j + 1).getParse().getWordWithPos()
                        self.wordBiGramModel.addNGram(words2)
                        igs2[0] = igs1[0]
                        igs2[1] = Word(
                            sentence.getWord(j +
                                             1).getParse().getTransitionList())
                        self.igBiGramModel.addNGram(igs2)
        self.wordUniGramModel.calculateNGramProbabilitiesSimple(
            LaplaceSmoothing())
        self.igUniGramModel.calculateNGramProbabilitiesSimple(
            LaplaceSmoothing())
        self.wordBiGramModel.calculateNGramProbabilitiesSimple(
            LaplaceSmoothing())
        self.igBiGramModel.calculateNGramProbabilitiesSimple(
            LaplaceSmoothing())

    def getWordProbability(self, word: Word, correctFsmParses: list,
                           index: int) -> float:
        """
        The getWordProbability method returns the probability of a word by using word bigram or unigram model.

        PARAMETERS
        ----------
        word : Word
            Word to find the probability.
        correctFsmParses : list
            FsmParse of given word which will be used for getting part of speech tags.
        index : int
            Index of FsmParse of which part of speech tag will be used to get the probability.

        RETURNS
        -------
        float
            The probability of the given word.
        """
        # Use the bigram model only when every preceding word already has a
        # disambiguated parse (exactly `index` parses fixed so far).
        if index != 0 and len(correctFsmParses) == index:
            return self.wordBiGramModel.getProbability(
                correctFsmParses[index - 1].getWordWithPos(), word)
        else:
            return self.wordUniGramModel.getProbability(word)

    def getIgProbability(self, word: Word, correctFsmParses: list,
                         index: int) -> float:
        """
        The getIgProbability method returns the probability of a word by using ig bigram or unigram model.

        PARAMETERS
        ----------
        word : Word
            Word to find the probability.
        correctFsmParses : list
            FsmParse of given word which will be used for getting transition list.
        index : int
            Index of FsmParse of which transition list will be used to get the probability.

        RETURNS
        -------
        float
            The probability of the given word.
        """
        # Same bigram/unigram fallback logic as getWordProbability, but over
        # transition-list (ig) tokens.
        if index != 0 and len(correctFsmParses) == index:
            return self.igBiGramModel.getProbability(
                Word(correctFsmParses[index - 1].getTransitionList()), word)
        else:
            return self.igUniGramModel.getProbability(word)

    def getBestRootWord(self, fsmParseList: FsmParseList) -> Word:
        """
        The getBestRootWord method takes a FsmParseList as an input and loops through the list. It gets each word with
        its part of speech tags as a new Word word and its transition list as a Word ig. Then, finds their corresponding
        probabilities. At the end returns the word with the highest probability.

        PARAMETERS
        ----------
        fsmParseList : FsmParseList
            FsmParseList is used to get the part of speech tags and transition lists of words.

        RETURNS
        -------
        Word
            The word with the highest probability.
        """
        # -1 sentinel: any non-negative probability beats it, so the first
        # parse always initializes bestWord.
        bestProbability = -1
        bestWord = None
        for j in range(fsmParseList.size()):
            word = fsmParseList.getFsmParse(j).getWordWithPos()
            ig = Word(fsmParseList.getFsmParse(j).getTransitionList())
            wordProbability = self.wordUniGramModel.getProbability(word)
            igProbability = self.igUniGramModel.getProbability(ig)
            # Score is the joint (independence-assumed) word * ig probability.
            probability = wordProbability * igProbability
            if probability > bestProbability:
                bestWord = word
                bestProbability = probability
        return bestWord

    def getParseWithBestIgProbability(self, parseList: FsmParseList,
                                      correctFsmParses: list,
                                      index: int) -> FsmParse:
        """
        The getParseWithBestIgProbability gets each FsmParse's transition list as a Word ig. Then, finds the
        corresponding probability. At the end returns the parse with the highest ig probability.

        PARAMETERS
        ----------
        parseList : FsmParseList
            FsmParseList is used to get the FsmParse.
        correctFsmParses : list
            FsmParse is used to get the transition lists.
        index : int
            Index of FsmParse of which transition list will be used to get the probability.

        RETURNS
        -------
        FsmParse
            The parse with the highest probability.
        """
        bestParse = None
        bestProbability = -1
        for j in range(parseList.size()):
            ig = Word(parseList.getFsmParse(j).getTransitionList())
            probability = self.getIgProbability(ig, correctFsmParses, index)
            if probability > bestProbability:
                bestParse = parseList.getFsmParse(j)
                bestProbability = probability
        return bestParse

    def disambiguate(self, fsmParses: list) -> list:
        """
        The disambiguate method gets an array of fsmParses. Then loops through that parses and finds the most probable
        root word and removes the other words which are identical to the most probable root word. At the end, gets the
        most probable parse among the fsmParses and adds it to the correctFsmParses list.

        PARAMETERS
        ----------
        fsmParses : list
            FsmParseList to disambiguate.

        RETURNS
        -------
        list
            correctFsmParses list which holds the most probable parses.
        """
        correctFsmParses = []
        for i in range(len(fsmParses)):
            # Pick the best root, prune parses with a different root/POS,
            # then pick the best remaining parse by ig probability.
            bestWord = self.getBestRootWord(fsmParses[i])
            fsmParses[i].reduceToParsesWithSameRootAndPos(bestWord)
            bestParse = self.getParseWithBestIgProbability(
                fsmParses[i], correctFsmParses, i)
            if bestParse is not None:
                correctFsmParses.append(bestParse)
        return correctFsmParses

    def saveModel(self):
        """
        Method to save unigrams and bigrams.
        """
        # Superclass persists the unigram models; bigrams are saved here.
        super().saveModel()
        self.wordBiGramModel.saveAsText("words2.txt")
        self.igBiGramModel.saveAsText("igs2.txt")

    def loadModel(self):
        """
        Method to load unigrams and bigrams.
        """
        # Superclass loads the unigram models; bigrams are loaded here.
        super().loadModel()
        self.wordBiGramModel = NGram("words2.txt")
        self.igBiGramModel = NGram("igs2.txt")
Exemplo n.º 21
0
    def setProbabilities(self, nGram: NGram, level: int):
        """
        Wrapper function to set the N-gram probabilities with interpolated smoothing.

        Smooths every level from 2..N and then level 1 with the simple
        smoothing algorithm, then installs the interpolation lambdas for a
        bigram (lambda1) or trigram (lambda1, lambda2) model.

        NOTE(review): the ``level`` parameter is never used in this body —
        all levels are always smoothed. Confirm whether per-level smoothing
        was intended.

        PARAMETERS
        ----------
        nGram : NGram
            N-Gram for which the probabilities will be set.
        level : int
            Level for which N-Gram probabilities will be set. Probabilities for different levels of the N-gram can be
            set with this function. If level = 1, N-Gram is treated as UniGram, if level = 2, N-Gram is treated as
            Bigram, etc. (Currently ignored by the implementation; see NOTE above.)
        """
        # Higher-order levels first, then the unigram level.
        for j in range(2, nGram.getN() + 1):
            nGram.calculateNGramProbabilitiesSimpleLevel(
                self.__simpleSmoothing, j)
        nGram.calculateNGramProbabilitiesSimpleLevel(self.__simpleSmoothing, 1)
        # Interpolation weights depend on the model order.
        if nGram.getN() == 2:
            nGram.setLambda2(self.__lambda1)
        elif nGram.getN() == 3:
            nGram.setLambda3(self.__lambda1, self.__lambda2)
Exemplo n.º 22
0
 def test_Merge(self):
     """Merging models loaded from part files yields the expected vocabularies."""
     self.simpleUniGram = NGram("simple1a.txt")
     for part_file in ["simple1b.txt"]:
         self.simpleUniGram.merge(NGram(part_file))
     self.assertEqual(18, self.simpleUniGram.vocabularySize())
     self.simpleBiGram = NGram("simple2a.txt")
     for part_file in ["simple2b.txt", "simple2c.txt", "simple2d.txt"]:
         self.simpleBiGram.merge(NGram(part_file))
     self.assertEqual(21, self.simpleBiGram.vocabularySize())
     self.simpleTriGram = NGram("simple3a.txt")
     for part_file in ["simple3b.txt", "simple3c.txt"]:
         self.simpleTriGram.merge(NGram(part_file))
     self.assertEqual(20, self.simpleTriGram.vocabularySize())
Exemplo n.º 23
0
class NGramTest(CorpusTest, unittest.TestCase):
    """Unit tests for NGram counting, pruning, persistence and merging.

    NOTE: the original tests passed a trailing ``0.0`` as a third positional
    argument to ``assertEqual``; that argument is the failure *message*, not
    a numeric delta, so it has been removed throughout.
    """

    simpleUniGram: NGram
    simpleBiGram: NGram
    simpleTriGram: NGram
    complexUniGram: NGram
    complexBiGram: NGram
    complexTriGram: NGram
    simpleCorpus: list
    trainCorpus: list
    testCorpus: list
    validationCorpus: list

    def setUp(self) -> None:
        """Build toy n-grams from a hand-written corpus and large n-grams
        from the train file; load the test and validation corpora."""
        self.simpleCorpus = [
            ["<s>", "ali", "topu", "at", "mehmet", "ayşeye", "gitti", "</s>"],
            ["<s>", "ali", "top", "at", "ayşe", "eve", "gitti", "</s>"],
            ["<s>", "ayşe", "kitabı", "ver", "</s>"],
            ["<s>", "ali", "topu", "mehmete", "at", "</s>"],
            ["<s>", "ali", "topu", "at", "mehmet", "ayşeyle", "gitti", "</s>"]
        ]
        self.simpleUniGram = NGram(1, self.simpleCorpus)
        self.simpleBiGram = NGram(2, self.simpleCorpus)
        self.simpleTriGram = NGram(3, self.simpleCorpus)
        self.trainCorpus = self.readCorpus("../train.txt")
        self.complexUniGram = NGram(1, self.trainCorpus)
        self.complexBiGram = NGram(2, self.trainCorpus)
        self.complexTriGram = NGram(3, self.trainCorpus)
        self.testCorpus = self.readCorpus("../test.txt")
        self.validationCorpus = self.readCorpus("../validation.txt")

    def test_GetCountSimple(self):
        """Raw uni/bi/tri-gram counts on the toy corpus."""
        self.assertEqual(5, self.simpleUniGram.getCount(["<s>"]))
        self.assertEqual(0, self.simpleUniGram.getCount(["mahmut"]))
        self.assertEqual(1, self.simpleUniGram.getCount(["kitabı"]))
        self.assertEqual(4, self.simpleBiGram.getCount(["<s>", "ali"]))
        self.assertEqual(0, self.simpleBiGram.getCount(["ayşe", "ali"]))
        self.assertEqual(0, self.simpleBiGram.getCount(["mahmut", "ali"]))
        self.assertEqual(2, self.simpleBiGram.getCount(["at", "mehmet"]))
        self.assertEqual(1, self.simpleTriGram.getCount(["<s>", "ali", "top"]))
        self.assertEqual(0,
                         self.simpleTriGram.getCount(["ayşe", "kitabı", "at"]))
        self.assertEqual(0, self.simpleTriGram.getCount(["ayşe", "topu",
                                                         "at"]))
        self.assertEqual(
            0, self.simpleTriGram.getCount(["mahmut", "evde", "kal"]))
        self.assertEqual(2, self.simpleTriGram.getCount(["ali", "topu", "at"]))

    def test_GetCountComplex(self):
        """Raw uni/bi/tri-gram counts on the large training corpus."""
        self.assertEqual(20000, self.complexUniGram.getCount(["<s>"]))
        self.assertEqual(50, self.complexUniGram.getCount(["atatürk"]))
        self.assertEqual(11, self.complexBiGram.getCount(["<s>", "mustafa"]))
        self.assertEqual(3, self.complexBiGram.getCount(["mustafa", "kemal"]))
        self.assertEqual(
            1, self.complexTriGram.getCount(["<s>", "mustafa", "kemal"]))
        self.assertEqual(
            1, self.complexTriGram.getCount(["mustafa", "kemal", "atatürk"]))

    def test_VocabularySizeSimple(self):
        """The toy corpus contains exactly 15 distinct tokens."""
        self.assertEqual(15, self.simpleUniGram.vocabularySize())

    def test_VocabularySizeComplex(self):
        """Vocabulary sizes of the train, test and validation corpora."""
        self.assertEqual(57625, self.complexUniGram.vocabularySize())
        self.complexUniGram = NGram(1, self.testCorpus)
        self.assertEqual(55485, self.complexUniGram.vocabularySize())
        self.complexUniGram = NGram(1, self.validationCorpus)
        self.assertEqual(35663, self.complexUniGram.vocabularySize())

    def test_Prune(self):
        """Pruning with increasing thresholds removes low-probability bigrams."""
        self.simpleBiGram.prune(0.0)
        self.assertEqual(4, self.simpleBiGram.getCount(["<s>", "ali"]))
        self.assertEqual(1, self.simpleBiGram.getCount(["<s>", "ayşe"]))
        self.assertEqual(3, self.simpleBiGram.getCount(["ali", "topu"]))
        self.assertEqual(1, self.simpleBiGram.getCount(["ali", "top"]))
        self.assertEqual(2, self.simpleBiGram.getCount(["topu", "at"]))
        self.assertEqual(1, self.simpleBiGram.getCount(["topu", "mehmete"]))
        self.simpleBiGram.prune(0.6)
        self.assertEqual(4, self.simpleBiGram.getCount(["<s>", "ali"]))
        self.assertEqual(0, self.simpleBiGram.getCount(["<s>", "ayşe"]))
        self.assertEqual(3, self.simpleBiGram.getCount(["ali", "topu"]))
        self.assertEqual(0, self.simpleBiGram.getCount(["ali", "top"]))
        self.assertEqual(2, self.simpleBiGram.getCount(["topu", "at"]))
        self.assertEqual(0, self.simpleBiGram.getCount(["topu", "mehmete"]))
        self.simpleBiGram.prune(0.7)
        self.assertEqual(4, self.simpleBiGram.getCount(["<s>", "ali"]))
        self.assertEqual(3, self.simpleBiGram.getCount(["ali", "topu"]))
        self.assertEqual(2, self.simpleBiGram.getCount(["topu", "at"]))
        self.simpleBiGram.prune(0.8)
        self.assertEqual(4, self.simpleBiGram.getCount(["<s>", "ali"]))
        self.assertEqual(3, self.simpleBiGram.getCount(["ali", "topu"]))
        self.simpleBiGram.prune(0.9)
        self.assertEqual(4, self.simpleBiGram.getCount(["<s>", "ali"]))

    def test_SaveAsText(self):
        """Persist each simple model to its own text file."""
        self.simpleUniGram.saveAsText("simple1.txt")
        self.simpleBiGram.saveAsText("simple2.txt")
        self.simpleTriGram.saveAsText("simple3.txt")

    def test_Merge(self):
        """Merging models loaded from part files yields the expected vocabularies."""
        self.simpleUniGram = NGram("simple1a.txt")
        self.simpleUniGram.merge(NGram("simple1b.txt"))
        self.assertEqual(18, self.simpleUniGram.vocabularySize())
        self.simpleBiGram = NGram("simple2a.txt")
        self.simpleBiGram.merge(NGram("simple2b.txt"))
        self.simpleBiGram.merge(NGram("simple2c.txt"))
        self.simpleBiGram.merge(NGram("simple2d.txt"))
        self.assertEqual(21, self.simpleBiGram.vocabularySize())
        self.simpleTriGram = NGram("simple3a.txt")
        self.simpleTriGram.merge(NGram("simple3b.txt"))
        self.simpleTriGram.merge(NGram("simple3c.txt"))
        self.assertEqual(20, self.simpleTriGram.vocabularySize())

    def test_LoadMultiPart(self):
        """Loading a model split across multiple files reproduces the
        single-file counts and vocabulary."""
        self.simpleUniGram = NGram(1)
        self.simpleUniGram.initWithMultipleFile("simple1part1.txt",
                                                "simple1part2.txt")
        self.simpleBiGram = NGram(2)
        self.simpleBiGram.initWithMultipleFile("simple2part1.txt",
                                               "simple2part2.txt",
                                               "simple2part3.txt")
        self.simpleTriGram = NGram(3)
        self.simpleTriGram.initWithMultipleFile("simple3part1.txt",
                                                "simple3part2.txt",
                                                "simple3part3.txt",
                                                "simple3part4.txt")
        self.test_GetCountSimple()
        self.test_VocabularySizeSimple()
Exemplo n.º 24
0
 def setProbabilities(self, nGram: NGram, level: int):
     """
     Set the N-gram probabilities for the given level with a pseudo-count
     of 0.0 — presumably plain maximum-likelihood estimation with no
     smoothing; confirm against NGram.setProbabilityWithPseudoCount.

     PARAMETERS
     ----------
     nGram : NGram
         N-Gram for which the probabilities will be set.
     level : int
         Level of the N-Gram whose probabilities will be set.
     """
     nGram.setProbabilityWithPseudoCount(0.0, level)