Пример #1
0
    def test_unknown(self):
        # unknown() must answer False for every word of the first set and
        # True for every word of the second set.
        model = MLHMM(2, self.tagged_sents)

        seen_words = {
            'el', 'gato', 'come', 'pescado', '.', 'la', 'gata', 'salmón',
        }
        unseen_words = {'perro', 'salame'}

        for word in seen_words:
            self.assertFalse(model.unknown(word))

        for word in unseen_words:
            self.assertTrue(model.unknown(word))
Пример #2
0
    def test_unknown(self):
        # Each word set is paired with the assertion its unknown() result
        # has to satisfy: False for the first set, True for the second.
        model = MLHMM(2, self.tagged_sents)

        cases = (
            ({'el', 'gato', 'come', 'pescado', '.', 'la', 'gata', 'salmón'},
             self.assertFalse),
            ({'perro', 'salame'},
             self.assertTrue),
        )
        for words, check in cases:
            for word in words:
                check(model.unknown(word))
Пример #3
0
    def test_tag_prob_2gram(self):
        # Checks tag_prob and tag_log_prob (base 2) of one tag sequence for
        # a model built with n=2 and addone disabled.
        model = MLHMM(2, self.tagged_sents, addone=False)

        tags = ['D', 'N', 'V', 'N', 'P']
        expected = 0.5 * 0.5

        self.assertAlmostEqual(model.tag_prob(tags), expected)
        self.assertAlmostEqual(model.tag_log_prob(tags), log2(expected))
Пример #4
0
    def test_tag_prob_2gram(self):
        # The expected value 0.5 * 0.5 must be matched by tag_prob, and its
        # base-2 logarithm by tag_log_prob.
        expected_prob = 0.5 * 0.5
        tag_seq = 'D N V N P'.split()

        model = MLHMM(2, self.tagged_sents, addone=False)

        prob = model.tag_prob(tag_seq)
        self.assertAlmostEqual(prob, expected_prob)

        log_prob = model.tag_log_prob(tag_seq)
        self.assertAlmostEqual(log_prob, log2(expected_prob))
Пример #5
0
    def test_tag_prob_1gram(self):
        # Checks tag_prob and tag_log_prob (base 2) of one tag sequence for
        # a model built with n=1 and addone disabled.
        model = MLHMM(1, self.tagged_sents, addone=False)
        tags = ['D', 'N', 'V', 'N', 'P']

        # D V P and </s> have prob 2.0 / 12.0, N has prob 4.0 / 12.0.
        p_other = 2.0 / 12.0
        p_noun = 4.0 / 12.0
        expected = p_other**4 * p_noun**2

        self.assertAlmostEqual(model.tag_prob(tags), expected)
        self.assertAlmostEqual(model.tag_log_prob(tags), log2(expected))
Пример #6
0
    def test_tag_prob_1gram(self):
        # Expected probability of the tag sequence under the n=1 model:
        # D V P and </s> have prob 2.0 / 12.0, N has prob 4.0 / 12.0.
        expected_prob = (2.0 / 12.0)**4 * (4.0 / 12.0)**2
        tag_seq = 'D N V N P'.split()

        model = MLHMM(1, self.tagged_sents, addone=False)

        prob = model.tag_prob(tag_seq)
        self.assertAlmostEqual(prob, expected_prob)

        log_prob = model.tag_log_prob(tag_seq)
        self.assertAlmostEqual(log_prob, log2(expected_prob))
Пример #7
0
    def test_tcount_1gram(self):
        # tcount() must return the listed count for the empty gram and for
        # each single-tag gram; the gram doubles as the failure message.
        model = MLHMM(1, self.tagged_sents)

        expected_counts = (
            ((), 12),
            (('D',), 2),
            (('N',), 4),
            (('V',), 2),
            (('P',), 2),
            (('</s>',), 2),
        )
        for ngram, count in expected_counts:
            self.assertEqual(model.tcount(ngram), count, ngram)
Пример #8
0
    def test_trans_prob_2gram(self):
        # Each row is (tag, previous tags, expected trans_prob) for a model
        # built with n=2 and addone disabled.
        model = MLHMM(2, self.tagged_sents, addone=False)

        expectations = (
            ('D', ('<s>',), 1.0),
            ('N', ('D',), 1.0),
            ('V', ('N',), 0.5),
            ('N', ('V',), 1.0),
            ('P', ('N',), 0.5),
            ('</s>', ('P',), 1.0),
        )
        for tag, prev_tags, expected in expectations:
            actual = model.trans_prob(tag, prev_tags)
            self.assertAlmostEqual(actual, expected, msg=(tag, prev_tags))
Пример #9
0
    def test_trans_prob_2gram(self):
        # Expected trans_prob value for each (tag, previous tags) argument
        # pair, checked against a model with n=2 and addone disabled.
        expected_by_args = [
            (('D', ('<s>',)), 1.0),
            (('N', ('D',)), 1.0),
            (('V', ('N',)), 0.5),
            (('N', ('V',)), 1.0),
            (('P', ('N',)), 0.5),
            (('</s>', ('P',)), 1.0),
        ]

        model = MLHMM(2, self.tagged_sents, addone=False)
        for args, expected in expected_by_args:
            tag, prev_tags = args
            self.assertAlmostEqual(
                model.trans_prob(tag, prev_tags), expected, msg=args)
Пример #10
0
    def test_tcount_1gram(self):
        # Grams and the tcount() value expected for each, in check order.
        grams = [(), ('D',), ('N',), ('V',), ('P',), ('</s>',)]
        counts = [12, 2, 4, 2, 2, 2]

        model = MLHMM(1, self.tagged_sents)
        for gram, expected in zip(grams, counts):
            # The gram is passed as the message to identify a failing case.
            self.assertEqual(model.tcount(gram), expected, gram)
Пример #11
0
    def test_prob_2gram(self):
        # Joint probability (plain and log2) of a sentence and its tagging
        # for a model built with n=2 and addone disabled.
        model = MLHMM(2, self.tagged_sents, addone=False)

        words = ['el', 'gato', 'come', 'pescado', '.']
        tags = ['D', 'N', 'V', 'N', 'P']

        # V after N and P after N have prob 0.5. the rest is 1.0.
        transition = 0.5 * 0.5
        # probs for el/D gato/N come/V pescado/N ./P
        emission = 0.5 * 0.25 * 1.0 * 0.25 * 1.0

        self.assertAlmostEqual(model.prob(words, tags), transition * emission)
        self.assertAlmostEqual(
            model.log_prob(words, tags), log2(transition) + log2(emission))
Пример #12
0
    def test_prob_2gram(self):
        # Expected factors of the joint probability under the n=2 model.
        # V after N and P after N have prob 0.5. the rest is 1.0.
        expected_tag_prob = 0.5 * 0.5
        # probs for el/D gato/N come/V pescado/N ./P
        expected_out_prob = 0.5 * 0.25 * 1.0 * 0.25 * 1.0

        sentence = 'el gato come pescado .'.split()
        tagging = 'D N V N P'.split()
        model = MLHMM(2, self.tagged_sents, addone=False)

        prob = model.prob(sentence, tagging)
        self.assertAlmostEqual(prob, expected_tag_prob * expected_out_prob)

        log_prob = model.log_prob(sentence, tagging)
        expected_log = log2(expected_tag_prob) + log2(expected_out_prob)
        self.assertAlmostEqual(log_prob, expected_log)
Пример #13
0
    def test_prob_1gram(self):
        # Joint probability (plain and log2) of a sentence and its tagging
        # for a model built with n=1 and addone disabled.
        model = MLHMM(1, self.tagged_sents, addone=False)

        words = ['el', 'gato', 'come', 'pescado', '.']
        tags = ['D', 'N', 'V', 'N', 'P']

        # D V P and </s> have prob 2.0 / 12.0, N has prob 4.0 / 12.0.
        p_other = 2.0 / 12.0
        p_noun = 4.0 / 12.0
        transition = p_other**4 * p_noun**2
        # probs for el/D gato/N come/V pescado/N ./P
        emission = 0.5 * 0.25 * 1.0 * 0.25 * 1.0

        self.assertAlmostEqual(model.prob(words, tags), transition * emission)
        self.assertAlmostEqual(
            model.log_prob(words, tags), log2(transition) + log2(emission))
Пример #14
0
    def test_prob_1gram(self):
        # Expected factors of the joint probability under the n=1 model.
        # D V P and </s> have prob 2.0 / 12.0, N has prob 4.0 / 12.0.
        expected_tag_prob = (2.0 / 12.0)**4 * (4.0 / 12.0)**2
        # probs for el/D gato/N come/V pescado/N ./P
        expected_out_prob = 0.5 * 0.25 * 1.0 * 0.25 * 1.0

        sentence = 'el gato come pescado .'.split()
        tagging = 'D N V N P'.split()
        model = MLHMM(1, self.tagged_sents, addone=False)

        prob = model.prob(sentence, tagging)
        self.assertAlmostEqual(prob, expected_tag_prob * expected_out_prob)

        log_prob = model.log_prob(sentence, tagging)
        expected_log = log2(expected_tag_prob) + log2(expected_out_prob)
        self.assertAlmostEqual(log_prob, expected_log)
Пример #15
0
    def test_viterbi_tagger(self):
        # Tags one sentence with a ViterbiTagger wrapping an n=2 model, then
        # checks both the tagger's _pi table and the returned tagging.
        model = MLHMM(2, self.tagged_sents, addone=False)
        # XXX: or directly test hmm.tag?
        tagger = ViterbiTagger(model)

        tagging = tagger.tag('el gato come pescado .'.split())

        # Expected tagging; the path stored at step k is its first k tags.
        path = ['D', 'N', 'V', 'N', 'P']
        expected_pi = {
            0: {('<s>',): (0.0, [])},
            # 0.5 for el/D
            1: {('D',): (log2(0.5), path[:1])},
            # 0.25 for gato/N
            2: {('N',): (log2(0.5 * 0.25), path[:2])},
            # 0.5 for V after N
            3: {('V',): (log2(0.5 * 0.25 * 0.5), path[:3])},
            # 0.25 for pescado/N
            4: {('N',): (log2(0.5 * 0.25 * 0.5 * 0.25), path[:4])},
            # 0.5 for P after N
            5: {('P',): (log2(0.5 * 0.25 * 0.5 * 0.25 * 0.5), path[:5])},
        }
        self.assertEqualPi(tagger._pi, expected_pi)

        self.assertEqual(tagging, path)
Пример #16
0
    # Load the tagged sentences used for training.
    # Raw string: identical pattern text, but without the invalid "\."
    # escape sequences that a plain string literal triggers warnings for.
    # NOTE(review): presumably the reader treats this as a regex over file
    # paths below the corpus root -- confirm against the reader.
    files = r'CESS-CAST-(A|AA|P)/.*\.tbf\.xml'
    corpus = SimpleAncoraCorpusReader('corpus/ancora-2.0/', files)
    sents = list(corpus.tagged_sents())

    # model type requested with -m
    m = str(opts['-m'])
    # output path requested with -o
    filename = opts['-o']

    # train the model
    if m == "base":
        print("Baseline Model selected")
        model = BaselineTagger(tagged_sents=sents)
    elif m == "mlhmm":
        n = int(opts['-n'])
        print("Maximum Likelihood Hidden Markov Model selected, n=%d" % n)
        model = MLHMM(n=n, tagged_sents=sents, addone=True)
    elif m == 'memm':
        n = int(opts['-n'])
        c = str(opts['-c'])
        if c not in ['logreg', 'nb', 'svc']:
            print("Bad classifier type, use --help option for help")
            # Non-zero status so callers can detect the invalid argument.
            exit(1)
        print("Maximum Entropy Markov Model selected, n=%d, c=%s" % (n, c))
        model = MEMM(n=n, tagged_sents=sents, classifier=c)
    else:
        print("Bad model type, use --help option for help")
        # Non-zero status so callers can detect the invalid argument.
        exit(1)

    # save it; the context manager flushes and closes the file even if
    # pickling raises.
    with open(filename, 'wb') as f:
        pickle.dump(model, f)
Пример #17
0
def MLHMM_trainer(tagged_sents):
    """Build and return an MLHMM from ``tagged_sents``.

    ``n`` is not a parameter: it is looked up in the surrounding scope when
    the function is called, so it must be defined there.
    """
    model = MLHMM(n, tagged_sents)
    return model