Exemplo n.º 1
0
    def calculateEmissionProbabilities(self, state: object, observations: list,
                                       emittedSymbols: list) -> dict:
        """
        Estimates the emission probabilities of a single state. Every position at which the given state occurs in
        the observations is paired with the symbol stored at the same position of the emitted symbols; the count of
        each such symbol is then divided by the total number of counted symbols.

        PARAMETERS
        ----------
        state : object
            The state whose emission probabilities are calculated.
        observations : list
            An array of instances, where each instance consists of an array of states.
        emittedSymbols : list
            An array of instances, where each instance consists of an array of symbols. It is read with the same
            indices as observations.

        RETURNS
        -------
        dict
            Emission probabilities for the given state. Contains one probability for each symbol that was counted
            for the state.
        """
        symbolCounts = CounterHashMap()
        for sequenceIndex, stateSequence in enumerate(observations):
            for position, observedState in enumerate(stateSequence):
                emitted = emittedSymbols[sequenceIndex][position]
                if observedState == state:
                    symbolCounts.put(emitted)
        emissionCount = symbolCounts.sumOfCounts()
        return {symbol: symbolCounts[symbol] / emissionCount for symbol in symbolCounts}
 def test_MaxThreshold2(self):
     """
     Puts one million random keys, each the string form of randrange(100), into a CounterHashMap. The ratio of the
     key returned by max() is then used as a threshold: max with a threshold slightly below that ratio must return
     a key, max with a threshold slightly above it must return None.
     """
     trials = 1000000
     randomCounts = CounterHashMap()
     for _ in range(trials):
         randomCounts.put(str(randrange(100)))
     topRatio = randomCounts.count(randomCounts.max()) / float(trials)
     self.assertIsNotNone(randomCounts.max(topRatio - 0.001))
     self.assertIsNone(randomCounts.max(topRatio + 0.001))
 def test_Put2(self):
     """
     Puts 1000 random keys drawn with randrange(1000) into a CounterHashMap and checks that the counts of the keys
     0..999 add up to the number of puts.
     """
     counterHashMap = CounterHashMap()
     for _ in range(1000):
         counterHashMap.put(randrange(1000))
     count = sum(counterHashMap.count(i) for i in range(1000))
     # assertEqual replaces the deprecated alias assertEquals, which was removed in Python 3.12.
     self.assertEqual(1000, count)
 def test_Add3(self):
     """
     Adds a CounterHashMap holding the keys 1500..1999 to one holding the keys 0..999 and checks that the merged
     map contains 1500 keys.
     """
     counterHashMap1 = CounterHashMap()
     for i in range(1000):
         counterHashMap1.put(i)
     counterHashMap2 = CounterHashMap()
     for i in range(500, 1000):
         counterHashMap2.putNTimes(1000 + i, i + 1)
     counterHashMap1.add(counterHashMap2)
     # assertEqual replaces the deprecated alias assertEquals, which was removed in Python 3.12.
     self.assertEqual(1500, len(counterHashMap1))
 def test_NERCorpus(self):
     """
     Reads "../nerdata.txt" as an NERCorpus, checks its sentence count and word count, then counts the named
     entity type of every word in the corpus and compares the counts of four types with their expected values.
     """
     nerCorpus = NERCorpus("../nerdata.txt")
     self.assertEqual(27556, nerCorpus.sentenceCount())
     self.assertEqual(492233, nerCorpus.numberOfWords())
     entityTypeCounts = CounterHashMap()
     for sentenceIndex in range(nerCorpus.sentenceCount()):
         sentence = nerCorpus.getSentence(sentenceIndex)
         for wordIndex in range(sentence.wordCount()):
             entityTypeCounts.put(sentence.getWord(wordIndex).getNamedEntityType())
     expectedCounts = (
         (438976, NamedEntityType.NONE),
         (23878, NamedEntityType.PERSON),
         (16931, NamedEntityType.ORGANIZATION),
         (12448, NamedEntityType.LOCATION),
     )
     for expectedCount, entityType in expectedCounts:
         self.assertEqual(expectedCount, entityTypeCounts[entityType])
 def test_Add2(self):
     """
     Adds a CounterHashMap that shares only "item2" with the receiving map and checks the resulting counts: keys
     present in one map keep their count, the shared key gets the sum of both counts.
     """
     counterHashMap1 = CounterHashMap()
     for item in ("item1", "item2", "item1", "item2", "item1"):
         counterHashMap1.put(item)
     counterHashMap2 = CounterHashMap()
     counterHashMap2.put("item4")
     counterHashMap2.putNTimes("item5", 4)
     counterHashMap2.put("item2")
     counterHashMap1.add(counterHashMap2)
     # assertEqual replaces the deprecated alias assertEquals, which was removed in Python 3.12.
     self.assertEqual(3, counterHashMap1.count("item1"))
     self.assertEqual(3, counterHashMap1.count("item2"))
     self.assertEqual(1, counterHashMap1.count("item4"))
     self.assertEqual(4, counterHashMap1.count("item5"))
 def test_TransitionWith(self):
     """
     Counts the string form of every transition of every state in self.stateList (as returned by
     self.fsm.getTransitions) and checks the five entries of topN(5). Each entry is read with the count at
     index 0 and the key at index 1.
     """
     transitionCounts = CounterHashMap()
     for state in self.stateList:
         for transition in self.fsm.getTransitions(state):
             transitionCounts.put(str(transition))
     topList = transitionCounts.topN(5)
     expectedTop = [(111, "0"), (37, "lAr"), (28, "DHr"), (24, "Hn"), (23, "lArH")]
     for rank, (expectedCount, expectedKey) in enumerate(expectedTop):
         self.assertEqual(expectedKey, topList[rank][1])
         self.assertEqual(expectedCount, topList[rank][0])
 def test_TransitionWithName(self):
     """
     Counts the value of withName() of every transition of every state in self.stateList (as returned by
     self.fsm.getTransitions) and checks the five entries of topN(5). Each entry is read with the count at
     index 0 and the key at index 1; the first expected key is None.
     """
     transitionCounts = CounterHashMap()
     for state in self.stateList:
         for transition in self.fsm.getTransitions(state):
             transitionCounts.put(transition.withName())
     topList = transitionCounts.topN(5)
     expectedTop = [(52, None), (33, "^DB+VERB+CAUS"), (31, "^DB+VERB+PASS"), (28, "A3PL"), (24, "LOC")]
     for rank, (expectedCount, expectedName) in enumerate(expectedTop):
         self.assertEqual(expectedName, topList[rank][1])
         self.assertEqual(expectedCount, topList[rank][0])
Exemplo n.º 9
0
    def getMaximum(classLabels: list) -> str:
        """
        Finds the mode of a list of class labels: every label is counted in a CounterHashMap and the result of its
        max method is returned.

        PARAMETERS
        ----------
        classLabels : list
            An array of class labels.

        RETURNS
        -------
        str
            The class label that occurs most often in the given array of class labels.
        """
        labelCounts = CounterHashMap()
        for classLabel in classLabels:
            labelCounts.put(classLabel)
        mostFrequentLabel = labelCounts.max()
        return mostFrequentLabel
    def classify(self, actualClass: str, predictedClass: str):
        """
        Records one classification result in the matrix dictionary. The CounterHashMap stored under the actual
        class is reused when there is one, otherwise a new CounterHashMap is created. The predicted class is put
        into that CounterHashMap, which is then stored in the matrix dictionary under the actual class.

        PARAMETERS
        ----------
        actualClass : str
            String input actual class.
        predictedClass : str
            String input predicted class.
        """
        predictionCounts = self.__matrix[actualClass] if actualClass in self.__matrix else CounterHashMap()
        predictionCounts.put(predictedClass)
        self.__matrix[actualClass] = predictionCounts
 def test_StartEndStates(self):
     """
     Checks two statistics of self.stateList: the number of states for which isEndState() holds, and the number
     of states per value of getPos().
     """
     endStateCount = sum(1 for state in self.stateList if state.isEndState())
     self.assertEqual(35, endStateCount)
     posCounts = CounterHashMap()
     for state in self.stateList:
         posCounts.put(state.getPos())
     expectedPosCounts = (
         (1, "HEAD"),
         (6, "PRON"),
         (1, "PROP"),
         (8, "NUM"),
         (7, "ADJ"),
         (1, "INTERJ"),
         (1, "DET"),
         (1, "ADVERB"),
         (1, "QUES"),
         (1, "CONJ"),
         (26, "VERB"),
         (1, "POSTP"),
         (1, "DUP"),
         (11, "NOUN"),
     )
     for expectedCount, pos in expectedPosCounts:
         self.assertEqual(expectedCount, posCounts.get(pos))
 def test_Max(self):
     """
     Puts "item1" three times, "item2" twice and "item3" once, then checks that max() returns "item1".
     """
     counterHashMap = CounterHashMap()
     for item in ("item1", "item2", "item3", "item1", "item2", "item1"):
         counterHashMap.put(item)
     # assertEqual replaces the deprecated alias assertEquals, which was removed in Python 3.12.
     self.assertEqual("item1", counterHashMap.max())
 def test_DependencyCorpus(self):
     """
     Reads "../metu-treebank.xml" as a TurkishDependencyTreeBankCorpus and checks its sentence count. Then walks
     over every word of every sentence, accumulating the total word count and counting the dependency type of
     each word whose relation is not None. Finally compares the per-type counts and the total word count with
     their expected values.
     """
     relationCounts = CounterHashMap()
     corpus = TurkishDependencyTreeBankCorpus("../metu-treebank.xml")
     self.assertEqual(5635, corpus.sentenceCount())
     totalWords = 0
     for sentenceIndex in range(corpus.sentenceCount()):
         sentence = corpus.getSentence(sentenceIndex)
         totalWords += sentence.wordCount()
         for wordIndex in range(sentence.wordCount()):
             word = sentence.getWord(wordIndex)
             if word.getRelation() is None:
                 continue
             relationCounts.put(word.getRelation().getTurkishDependencyType())
     expectedCounts = (
         (11692, TurkishDependencyType.MODIFIER),
         (903, TurkishDependencyType.INTENSIFIER),
         (1142, TurkishDependencyType.LOCATIVE_ADJUNCT),
         (240, TurkishDependencyType.VOCATIVE),
         (7261, TurkishDependencyType.SENTENCE),
         (16, TurkishDependencyType.EQU_ADJUNCT),
         (159, TurkishDependencyType.NEGATIVE_PARTICLE),
         (4481, TurkishDependencyType.SUBJECT),
         (2476, TurkishDependencyType.COORDINATION),
         (2050, TurkishDependencyType.CLASSIFIER),
         (73, TurkishDependencyType.COLLOCATION),
         (1516, TurkishDependencyType.POSSESSOR),
         (523, TurkishDependencyType.ABLATIVE_ADJUNCT),
         (23, TurkishDependencyType.FOCUS_PARTICLE),
         (1952, TurkishDependencyType.DETERMINER),
         (1361, TurkishDependencyType.DATIVE_ADJUNCT),
         (202, TurkishDependencyType.APPOSITION),
         (289, TurkishDependencyType.QUESTION_PARTICLE),
         (597, TurkishDependencyType.S_MODIFIER),
         (10, TurkishDependencyType.ETOL),
         (8338, TurkishDependencyType.OBJECT),
         (271, TurkishDependencyType.INSTRUMENTAL_ADJUNCT),
         (85, TurkishDependencyType.RELATIVIZER),
     )
     for expectedCount, dependencyType in expectedCounts:
         self.assertEqual(expectedCount, relationCounts.get(dependencyType))
     self.assertEqual(53993, totalWords)
 def test_MaxThreshold1(self):
     """
     Puts "item1" three times out of six puts (a ratio of 0.5) and checks max with a threshold: "item1" must be
     returned for a threshold just below 0.5 and must not be returned for a threshold just above 0.5.
     """
     counterHashMap = CounterHashMap()
     for item in ("item1", "item2", "item3", "item1", "item2", "item1"):
         counterHashMap.put(item)
     # assertEqual / assertNotEqual replace the deprecated aliases assertEquals / assertNotEquals, which were
     # removed in Python 3.12.
     self.assertEqual("item1", counterHashMap.max(0.4999))
     self.assertNotEqual("item1", counterHashMap.max(0.5001))
 def test_TopN1(self):
     """
     Puts "item1" three times, "item2" twice and "item3" once, then checks that the last entry of topN(1),
     topN(2) and topN(3) carries "item1", "item2" and "item3" respectively. The key is read at index 1 of an
     entry.
     """
     counterHashMap = CounterHashMap()
     for item in ("item1", "item2", "item3", "item1", "item2", "item1"):
         counterHashMap.put(item)
     # assertEqual replaces the deprecated alias assertEquals, which was removed in Python 3.12.
     self.assertEqual("item1", counterHashMap.topN(1)[0][1])
     self.assertEqual("item2", counterHashMap.topN(2)[1][1])
     self.assertEqual("item3", counterHashMap.topN(3)[2][1])
Exemplo n.º 16
0
 def test_Put1(self):
     """
     Puts "item1" three times, "item2" twice and "item3" once, then checks that count reports exactly those
     numbers.
     """
     counterHashMap = CounterHashMap()
     for item in ("item1", "item2", "item3", "item1", "item2", "item1"):
         counterHashMap.put(item)
     # assertEqual replaces the deprecated alias assertEquals, which was removed in Python 3.12.
     self.assertEqual(3, counterHashMap.count("item1"))
     self.assertEqual(2, counterHashMap.count("item2"))
     self.assertEqual(1, counterHashMap.count("item3"))
 def test_Add1(self):
     """
     Adds a CounterHashMap filled through putNTimes to one filled through put, both over the same three keys,
     and checks that every key ends up with the sum of its two counts (3 + 6, 2 + 6 and 1 + 6).
     """
     counterHashMap1 = CounterHashMap()
     for item in ("item1", "item2", "item3", "item1", "item2", "item1"):
         counterHashMap1.put(item)
     counterHashMap2 = CounterHashMap()
     for item, times in (("item1", 2), ("item2", 3), ("item3", 6), ("item1", 2), ("item2", 3), ("item1", 2)):
         counterHashMap2.putNTimes(item, times)
     counterHashMap1.add(counterHashMap2)
     # assertEqual replaces the deprecated alias assertEquals, which was removed in Python 3.12.
     self.assertEqual(9, counterHashMap1.count("item1"))
     self.assertEqual(8, counterHashMap1.count("item2"))
     self.assertEqual(7, counterHashMap1.count("item3"))
 def test_SumOfCounts(self):
     """
     Puts 1000 random keys drawn with randrange(1000) and checks that sumOfCounts equals the number of puts.
     """
     counterHashMap = CounterHashMap()
     for _ in range(1000):
         counterHashMap.put(randrange(1000))
     # assertEqual replaces the deprecated alias assertEquals, which was removed in Python 3.12.
     self.assertEqual(1000, counterHashMap.sumOfCounts())
Exemplo n.º 19
0
class Corpus:
    """
    A list of sentences, optionally grouped into paragraphs, together with a CounterHashMap that counts every word
    added through addSentence.
    """

    paragraphs: list
    sentences: list
    wordList: CounterHashMap
    fileName: str

    def __init__(self, fileName=None, splitterOrChecker=None):
        """
        Constructor of Corpus class. Without a file name an empty corpus is created. With a file name the file is
        read line by line (as utf8) and every stripped line is added to the corpus; how a line is added depends on
        splitterOrChecker.

        PARAMETERS
        ----------
        fileName : str
            String file name input that will be read. When None, nothing is read.
        splitterOrChecker
            When None, each line becomes one Sentence. When a SentenceSplitter, each line is split into sentences
            which are added as one Paragraph. When a LanguageChecker, each line becomes one Sentence constructed
            with that checker. For any other value the file is read but nothing is added.
        """
        self.sentences = []
        self.paragraphs = []
        self.wordList = CounterHashMap()
        # Always define the attribute, so that getFileName also works for a corpus that was not read from a file.
        self.fileName = fileName
        if fileName is None:
            return
        # The context manager closes the file as soon as its lines are read; the original never closed it.
        with open(fileName, "r", encoding='utf8') as file:
            lines = file.readlines()
        if splitterOrChecker is None:
            for line in lines:
                self.addSentence(Sentence(line.strip()))
        elif isinstance(splitterOrChecker, SentenceSplitter):
            for line in lines:
                sentences = splitterOrChecker.split(line.strip())
                paragraph = Paragraph()
                for sentence in sentences:
                    paragraph.addSentence(sentence)
                self.addParagraph(paragraph)
        elif isinstance(splitterOrChecker, LanguageChecker):
            for line in lines:
                self.addSentence(Sentence(line.strip(), splitterOrChecker))

    def combine(self, corpus: "Corpus"):
        """
        The combine method takes a Corpus as an input and adds each of its sentences to this corpus.

        PARAMETERS
        ----------
        corpus : Corpus
            Corpus type input.
        """
        for sentence in corpus.sentences:
            self.addSentence(sentence)

    def addSentence(self, s: Sentence):
        """
        The addSentence method takes a Sentence as an input. It adds given input to sentences list and loops
        through the each word in sentence and puts these words into wordList CounterHashMap.

        PARAMETERS
        ----------
        s : Sentence
            Sentence type input that will be added to sentences list and its words will be added to wordList
            CounterHashMap.
        """
        self.sentences.append(s)
        for i in range(s.wordCount()):
            self.wordList.put(s.getWord(i))

    def numberOfWords(self) -> int:
        """
        The numberOfWords method sums the number of words of every sentence in the sentences list.

        RETURNS
        -------
        int
            The total number of words.
        """
        return sum(s.wordCount() for s in self.sentences)

    def contains(self, word: str) -> bool:
        """
        The contains method takes a String word as an input, wraps it in a Word and checks whether wordList
        CounterHashMap has that Word.

        PARAMETERS
        ----------
        word : str
            String input to check.

        RETURNS
        -------
        bool
            True if wordList has the given word, False otherwise.
        """
        return Word(word) in self.wordList

    def addParagraph(self, p: Paragraph):
        """
        The addParagraph method takes a Paragraph type input. It adds the paragraph to the paragraphs list and
        adds each of its sentences through addSentence, which also updates the wordList CounterHashMap.

        PARAMETERS
        ----------
        p : Paragraph
            Paragraph type input to add sentences and wordList.
        """
        self.paragraphs.append(p)
        for i in range(p.sentenceCount()):
            self.addSentence(p.getSentence(i))

    def getFileName(self) -> str:
        """
        Getter for the file name.

        RETURNS
        -------
        str
            The file name given to the constructor; None when the corpus was constructed without one.
        """
        return self.fileName

    def getWordList(self) -> set:
        """
        Getter for the wordList.

        RETURNS
        -------
        set
            A new set holding the keys of wordList.
        """
        return set(self.wordList.keys())

    def wordCount(self) -> int:
        """
        The wordCount method returns the size of the wordList CounterHashMap.

        RETURNS
        -------
        int
            The size of the wordList CounterHashMap.
        """
        return len(self.wordList)

    def getCount(self, word: Word) -> int:
        """
        The getCount method returns the count value of given word.

        PARAMETERS
        ----------
        word : Word
            Word type input to check.

        RETURNS
        -------
        int
            The value stored in wordList for the given word.
        """
        return self.wordList[word]

    def sentenceCount(self) -> int:
        """
        The sentenceCount method returns the size of the sentences list.

        RETURNS
        -------
        int
            The size of the sentences list.
        """
        return len(self.sentences)

    def getSentence(self, index: int) -> Sentence:
        """
        Getter for getting a sentence at given index.

        PARAMETERS
        ----------
        index : int
            index to get sentence from.

        RETURNS
        -------
        Sentence
            The sentence at given index.
        """
        return self.sentences[index]

    def paragraphCount(self) -> int:
        """
        The paragraphCount method returns the size of the paragraphs list.

        RETURNS
        -------
        int
            The size of the paragraphs list.
        """
        return len(self.paragraphs)

    def getParagraph(self, index: int) -> Paragraph:
        """
        Getter for getting a paragraph at given index.

        PARAMETERS
        ----------
        index : int
            index to get paragraph from.

        RETURNS
        -------
        Paragraph
            The paragraph at given index.
        """
        return self.paragraphs[index]

    def maxSentenceLength(self) -> int:
        """
        The maxSentenceLength method finds the sentence with the maximum number of words and returns this number.

        RETURNS
        -------
        int
            The maximum sentence length; 0 when there are no sentences.
        """
        return max((s.wordCount() for s in self.sentences), default=0)

    def getAllWordsAsList(self) -> list:
        """
        The getAllWordsAsList method creates a new list of lists: one entry per sentence, holding the result of
        getWords of that sentence.

        RETURNS
        -------
        list
            Newly created and populated list.
        """
        return [self.getSentence(i).getWords() for i in range(self.sentenceCount())]

    def shuffleSentences(self, seed: int):
        """
        The shuffleSentences method shuffles the sentences list in place, after seeding the random module with
        the given seed value.

        PARAMETERS
        ----------
        seed : int
            value to randomize shuffling.
        """
        # Seeds the module-level generator, so later calls to the random module are affected as well.
        random.seed(seed)
        random.shuffle(self.sentences)

    def getTrainCorpus(self, foldNo: int, foldCount: int) -> "Corpus":
        """
        The getTrainCorpus method takes two integer inputs foldNo and foldCount. With N as the sentenceCount, it
        creates a new empty Corpus and adds the sentences with indices from 0 up to (foldNo * N) // foldCount
        (exclusive), followed by the sentences with indices from ((foldNo + 1) * N) // foldCount up to N
        (exclusive). These are the sentences that getTestCorpus leaves out for the same arguments.

        PARAMETERS
        ----------
        foldNo : int
            Index of the fold that is left out of the train corpus.
        foldCount : int
            Total number of folds.

        RETURNS
        -------
        Corpus
            The newly created and populated Corpus.
        """
        trainCorpus = Corpus()
        N = self.sentenceCount()
        for i in range((foldNo * N) // foldCount):
            trainCorpus.addSentence(self.sentences[i])
        for i in range(((foldNo + 1) * N) // foldCount, N):
            trainCorpus.addSentence(self.sentences[i])
        return trainCorpus

    def getTestCorpus(self, foldNo: int, foldCount: int) -> "Corpus":
        """
        The getTestCorpus method takes two integer inputs foldNo and foldCount. With N as the sentenceCount, it
        creates a new empty Corpus and adds the sentences with indices from (foldNo * N) // foldCount up to
        ((foldNo + 1) * N) // foldCount (exclusive).

        PARAMETERS
        ----------
        foldNo : int
            Index of the fold that forms the test corpus.
        foldCount : int
            Total number of folds.

        RETURNS
        -------
        Corpus
            The newly created and populated Corpus.
        """
        testCorpus = Corpus()
        N = self.sentenceCount()
        for i in range((foldNo * N) // foldCount,
                       ((foldNo + 1) * N) // foldCount):
            testCorpus.addSentence(self.sentences[i])
        return testCorpus
Exemplo n.º 20
0
 def nextWordPos(nextParseList: FsmParseList) -> str:
     """
     Counts the value of getPos() of every parse in the given parse list and returns the result of max() of the
     resulting CounterHashMap.

     PARAMETERS
     ----------
     nextParseList : FsmParseList
         Parse list whose parses are read with getFsmParse for the indices 0 .. size() - 1.

     RETURNS
     -------
     str
         The value returned by CounterHashMap.max() for the counted pos values (presumably the most frequent one).
     """
     posCounts = CounterHashMap()
     for parseIndex in range(nextParseList.size()):
         parse = nextParseList.getFsmParse(parseIndex)
         posCounts.put(parse.getPos())
     return posCounts.max()
 def test_Put3(self):
     """
     Puts one million random keys drawn with randrange(1000000) and checks that the fraction of distinct keys
     equals 0.632 to three decimal places (1 - 1/e is approximately 0.632).
     """
     trials = 1000000
     randomCounts = CounterHashMap()
     for _ in range(trials):
         randomCounts.put(randrange(1000000))
     distinctRatio = len(randomCounts) / float(trials)
     self.assertAlmostEqual(distinctRatio, 0.632, 3)