Example #1
 # requires: import re, sys
 #           from nltk.corpus import cess_esp
 #           from nltk import HiddenMarkovModelTagger
 def printSpanishTags(self):
     sents = cess_esp.tagged_sents()
     tagger = HiddenMarkovModelTagger.train(sents)
     
     fullCorpus = self.fullCorpus()
     tagsDictionary = dict()
     for line in fullCorpus:
         spanishSentence = line[0]
         # decode the byte string (Python 2) and split on non-word characters
         spanishTokens = re.compile(r'\W+', re.UNICODE).split(unicode(spanishSentence, 'utf-8'))
         tags = tagger.tag(spanishTokens)
         for idx, token in enumerate(spanishTokens):
             if len(token) > 0:
                 tag = tags[idx][1]
                 sys.stdout.write(token.encode('utf-8'))
                 sys.stdout.write(":")
                 sys.stdout.write(tag)
                 sys.stdout.write("\n")
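
A minimal, self-contained driver for the same pipeline (a sketch, not part of the original class; the sample sentence is arbitrary and the cess_esp corpus must be downloaded first via `nltk.download('cess_esp')`):

import re
from nltk.corpus import cess_esp
from nltk.tag.hmm import HiddenMarkovModelTagger

tagger = HiddenMarkovModelTagger.train(cess_esp.tagged_sents())
tokens = [t for t in re.split(r'\W+', u'El gato come pescado') if t]
for token, tag in tagger.tag(tokens):
    print('%s:%s' % (token, tag))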
# Split into training and test set; `sents` holds the corpus tagged sentences from an earlier cell (e.g. cess_esp.tagged_sents())

# <codecell>

training_dx = int(len(sents) * 90 / 100)   # 90% for training, 10% for testing
training = sents[:training_dx]
test = sents[training_dx:]

# <markdowncell>

# Train the tagger and check its accuracy (this takes 40 seconds or so) ...

# <codecell>

from nltk import HiddenMarkovModelTagger, word_tokenize
spanish_tagger = HiddenMarkovModelTagger.train(training)
'accuracy %.1f %%' % (spanish_tagger.evaluate(test) * 100)
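
# <markdowncell>

# Note: newer NLTK releases deprecate `tagger.evaluate()` in favour of `tagger.accuracy()`; this notebook uses the older API.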

# <codecell>

spanish_tagger.tag(tokenize("A buen entendedor, pocas palabras bastan."))

# <codecell>

spanish_tagger.tag(tokenize("El gato blanco se sentó en la alfombra."))

# <markdowncell>

# Now Portuguese 

# <codecell>
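
# A sketch of the parallel Portuguese pipeline; NLTK's mac_morpho tagged
# corpus is an assumption here, since the original Portuguese cells are not
# shown in this fragment.

from nltk.corpus import mac_morpho

sents = mac_morpho.tagged_sents()
training_dx = int(len(sents) * 90 / 100)
training = sents[:training_dx]
test = sents[training_dx:]

portuguese_tagger = HiddenMarkovModelTagger.train(training)
'accuracy %.1f %%' % (portuguese_tagger.evaluate(test) * 100)
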
Example #4
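A map/reduce-style mapper (the enclosing class and its `outputcollector` and `tuple2str` helpers are not shown in this fragment) that computes local Baum-Welch statistics for one observation sequence, delegating the forward-backward computation to NLTK's HMM internals.
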
    def map(self, key, value):
        """
        establish the hmm model and estimate the local
        hmm parameters from the input sequences

        @param key: None
        @param value: input sequence
        """

        symbols, states, A, B, pi = self.read_params()
        N = len(states)
        M = len(symbols)
        symbol_dict = dict((symbols[i], i) for i in range(M))

        model = HiddenMarkovModelTagger(symbols=symbols, states=states, \
                transitions=A, outputs=B, priors=pi)

        logprob = 0
        sequence = list(value)
        if not sequence:
            return

        # compute forward and backward probabilities
        alpha = model._forward_probability(sequence)
        beta = model._backward_probability(sequence)

        # find the log probability of the sequence
        T = len(sequence)
        lpk = _log_add(*alpha[T-1, :])
        logprob += lpk

        # now update A and B (the transition and output numerators and
        # denominators) using the alpha and beta values; see Rabiner's
        # tutorial paper for the full derivation
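        # In Rabiner's notation the A numerator accumulates, over t,
        #   xi_t(i, j) = alpha_t(i) * a_ij * b_j(o_{t+1}) * beta_{t+1}(j)
        # while the denominators accumulate gamma_t(i) = alpha_t(i) * beta_t(i);
        # everything is held in log space, so products become sums and
        # sums become _log_add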
        local_A_numer = ones((N, N), float64) * _NINF
        local_B_numer = ones((N, M), float64) * _NINF
        local_A_denom = ones(N, float64) * _NINF
        local_B_denom = ones(N, float64) * _NINF

        # for each position, accumulate sums for A and B
        for t in range(T):
            x = sequence[t][_TEXT]  # observed symbol at t; FIXME: symbols missing from symbol_dict raise KeyError
            if t < T - 1:
                xnext = sequence[t+1][_TEXT]  # next observed symbol (same caveat)
            xi = symbol_dict[x]
            for i in range(N):
                si = states[i]
                if t < T - 1:
                    for j in range(N):
                        sj = states[j]
                        local_A_numer[i, j] = _log_add(
                                local_A_numer[i, j],
                                alpha[t, i] +
                                model._transitions[si].logprob(sj) +
                                model._outputs[sj].logprob(xnext) +
                                beta[t+1, j])
                    local_A_denom[i] = _log_add(local_A_denom[i],
                                alpha[t, i] + beta[t, i])
                else:
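                    # reached only at t == T-1: local_A_denom already holds
                    # the sum of gamma_t(i) for t < T-1, so log-adding the
                    # final gamma here yields B's denominator over all t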
                    local_B_denom[i] = _log_add(local_A_denom[i],
                            alpha[t, i] + beta[t, i])

                local_B_numer[i, xi] = _log_add(local_B_numer[i, xi],
                        alpha[t, i] + beta[t, i])

        for i in range(N):
            self.outputcollector.collect("parameters", \
                    tuple2str(("Pi", states[i], pi.prob(states[i]))))

        self.collect_matrix('A', local_A_numer, lpk, N, N)
        self.collect_matrix('B', local_B_numer, lpk, N, M)
        self.collect_matrix('A_denom', [local_A_denom], lpk, 1, N)
        self.collect_matrix('B_denom', [local_B_denom], lpk, 1, N)

        self.outputcollector.collect("parameters", "states " + \
                tuple2str(tuple(states)))
        self.outputcollector.collect("parameters", "symbols " + \
                tuple2str(tuple(symbols)))
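
A hypothetical sketch of the `collect_matrix` helper referenced above (it is not shown in this snippet, so its behaviour is an assumption): each row is emitted with the sequence log-probability `lpk` subtracted, i.e. scaled by 1/P(O|lambda) in log space, so that a reducer can `_log_add` the per-sequence contributions before renormalising.

    def collect_matrix(self, name, matrix, lpk, n_rows, n_cols):
        # one output line per row: (matrix name, row index, scaled log values)
        for i in range(n_rows):
            row = tuple(matrix[i][j] - lpk for j in range(n_cols))
            self.outputcollector.collect("parameters",
                    tuple2str((name, i) + row))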