Пример #1
0
    def constructDictionaryWithNonRareWords(self, level: int, probability: float) -> set:
        """
        Constructs a dictionary of nonrare words with given N-Gram level and probability threshold.

        PARAMETERS
        ----------
        level : int
            Level for counting words. Counts for different levels of the N-Gram can be set. If level = 1, N-Gram is
            treated as UniGram, if level = 2, N-Gram is treated as Bigram, etc.
        probability : float
            probability threshold for nonrare words.

        RETURNS
        -------
        set
            set of nonrare words.
        """
        result = set()
        wordCounter = CounterHashMap()
        self.rootNode.countWords(wordCounter, level)
        total = wordCounter.sumOfCounts()
        for symbol in wordCounter.keys():
            if wordCounter[symbol] / total > probability:
                result.add(symbol)
        return result
Пример #2
0
    def calculateEmissionProbabilities(self, state: object, observations: list,
                                       emittedSymbols: list) -> dict:
        """
        calculateEmissionProbabilities calculates the emission probabilities for a specific state. The method takes the
        state, an array of observations (which also consists of an array of states) and an array of instances (which also
        consists of an array of emitted symbols).

        PARAMETERS
        ----------
        states : set
            A Set of states, consisting of all possible states for this problem.
        observations : list
            An array of instances, where each instance consists of an array of states.
        emittedSymbols : list
            An array of instances, where each instance consists of an array of symbols.

        RETURNS
        -------
        dict
            A HashMap. Emission probabilities for a single state. Contains a probability for each symbol emitted.
        """
        counts = CounterHashMap()
        emissionProbabilities = {}
        for i in range(len(observations)):
            for j in range(len(observations[i])):
                currentState = observations[i][j]
                currentSymbol = emittedSymbols[i][j]
                if currentState == state:
                    counts.put(currentSymbol)
        total = counts.sumOfCounts()
        for symbol in counts:
            emissionProbabilities[symbol] = counts[symbol] / total
        return emissionProbabilities
 def test_PutNTimes2(self):
     counterHashMap = CounterHashMap()
     for i in range(1000):
         counterHashMap.putNTimes(randrange(1000), i + 1)
     self.assertEquals(500500, counterHashMap.sumOfCounts())
 def test_SumOfCounts(self):
     counterHashMap = CounterHashMap()
     for i in range(1000):
         counterHashMap.put(randrange(1000))
     self.assertEquals(1000, counterHashMap.sumOfCounts())