def test_eq(self):
    # Histories built from equal fields must compare equal; a history over
    # a different sentence must not.
    start_context = ('<s>', '<s>')

    first = History('el gato come pescado .'.split(), start_context, 0)
    twin = History('el gato come pescado .'.split(), start_context, 0)
    self.assertEqual(first, twin)

    different = History('la gata come salmón .'.split(), start_context, 0)
    self.assertNotEqual(first, different)
def tag(self, sent):
    """Tag a sentence greedily, from left to right.

    Each position i is tagged by self.tag_history, which receives a
    History made of the whole sentence, the previous n - 1 tags and i.

    sent -- the sentence (a sequence of words).

    Returns the list of predicted tags, one per word. An empty sentence
    yields an empty list, without calling self.tag_history.
    """
    # Before the first word the tag context is all start-of-sentence markers.
    prev_tags = ('<s>', ) * (self.n - 1)
    tags = []
    for i in range(len(sent)):
        tag = self.tag_history(History(sent, prev_tags, i))
        tags.append(tag)
        # Slide the window: add the new tag, drop the oldest one. Written as
        # (ctx + new)[1:] so the context stays empty when n == 1.
        prev_tags = (prev_tags + (tag, ))[1:]
    return tags
def test_sent_histories_1gram(self):
    # With n = 1 there are no previous tags, so every history carries an
    # empty tag context and only the position changes.
    memm = MEMM(1, self.tagged_sents)
    actual = list(memm.sent_histories(self.tagged_sents[0]))

    words = 'el gato come pescado .'.split()
    expected = [History(words, (), position) for position in range(5)]

    self.assertEqual(actual, expected)
def test_sent_histories_3gram(self):
    # With n = 3 each history carries the two preceding tags, padded with
    # '<s>' at the beginning of the sentence.
    memm = MEMM(3, self.tagged_sents)
    actual = list(memm.sent_histories(self.tagged_sents[0]))

    words = 'el gato come pescado .'.split()
    contexts = [
        ('<s>', '<s>'),
        ('<s>', 'D'),
        ('D', 'N'),
        ('N', 'V'),
        ('V', 'N'),
    ]
    expected = [
        History(words, context, position)
        for position, context in enumerate(contexts)
    ]

    self.assertEqual(actual, expected)
def sent_histories(self, tagged_sent):
    """
    Build the histories of a tagged sentence.

    tagged_sent -- the tagged sentence (a list of pairs (word, tag)).

    Returns a list with one History(sent, prev_tags, i) per position i:
    sent -- the whole sentence (list of words).
    prev_tags -- a tuple with the n - 1 tags preceding position i.
    i -- the position to be tagged.
    """
    order = self.n
    words = [word for word, _ in tagged_sent]
    # Left-pad the tag sequence with start markers so that the first
    # positions of the sentence also get a full-length context.
    padded_tags = ["<s>"] * (order - 1) + [tag for _, tag in tagged_sent]
    return [
        History(words, tuple(padded_tags[pos:pos + order - 1]), pos)
        for pos in range(len(words))
    ]
def tag(self, sent):
    """Tag a sentence greedily, from left to right.

    Every position is tagged by self.tag_history, which receives a History
    holding the whole sentence, the previous n - 1 tags and the position.

    sent -- the sentence (a sequence of words).

    Returns the list of predicted tags, one per word. An empty sentence
    yields an empty list, without calling self.tag_history.
    """
    n = self.n
    tags = []
    # The context is a tuple, not a list: a list would make the History
    # unhashable and would not compare equal to a tuple context.
    prev_tags = ('<s>', ) * (n - 1)
    for index in range(len(sent)):
        h = History(sent, prev_tags, index)
        tag = self.tag_history(h)
        tags.append(tag)
        # Shift the context window by one; stays empty when n == 1.
        prev_tags = (prev_tags + (tag, ))[1:]
    return tags
def test_prev_tags(self):
    # The prev_tags feature must return exactly the tag context stored in
    # the history, for every position of both sentences.
    sentences = [
        'El gato come pescado .'.split(),
        'La gata come salmón .'.split(),
    ]
    contexts = [
        ('<s>', '<s>'),
        ('<s>', 'D'),
        ('D', 'N'),
        ('N', 'V'),
        ('V', 'N'),
    ]
    cases = [
        (History(words, context, position), context)
        for words in sentences
        for position, context in enumerate(contexts)
    ]
    for history, expected in cases:
        self.assertEqual(prev_tags(history), expected)
def test_word_isupper(self):
    # word_isupper must be True only for the fully upper-case words
    # ('EL' in the first sentence, 'SALMÓN' in the second).
    contexts = [
        ('<s>', '<s>'),
        ('<s>', 'D'),
        ('D', 'N'),
        ('N', 'V'),
        ('V', 'N'),
    ]
    expectations = [
        ('EL gato come pescado .'.split(),
         [True, False, False, False, False]),
        ('La gata come SALMÓN .'.split(),
         [False, False, False, True, False]),
    ]
    cases = [
        (History(words, contexts[position], position), expected)
        for words, values in expectations
        for position, expected in enumerate(values)
    ]
    for history, expected in cases:
        # The history is passed as the failure message to ease debugging.
        self.assertEqual(word_isupper(history), expected, history)
def test_word_lower(self):
    # word_lower must return the lower-cased word at the history position.
    contexts = [
        ('<s>', '<s>'),
        ('<s>', 'D'),
        ('D', 'N'),
        ('N', 'V'),
        ('V', 'N'),
    ]
    expectations = [
        ('El gato come pescado .'.split(),
         ['el', 'gato', 'come', 'pescado', '.']),
        ('La gata come salmón .'.split(),
         ['la', 'gata', 'come', 'salmón', '.']),
    ]
    cases = [
        (History(words, contexts[position], position), expected)
        for words, values in expectations
        for position, expected in enumerate(values)
    ]
    for history, expected in cases:
        self.assertEqual(word_lower(history), expected)
def tag(self, sent):
    """
    Tag a sentence greedily, from left to right.

    For each position, self.tag_history is asked for a tag given a History
    with the whole sentence, the previous n - 1 tags and the position.

    sent -- the sentence (a sequence of words).

    Returns the list of predicted tags, one per word. An empty sentence
    yields an empty list, without calling self.tag_history.
    """
    n = self.n
    # Initial context: only start-of-sentence markers.
    prev_tags = ("<s>", ) * (n - 1)
    my_tagging = []
    for i in range(len(sent)):
        history = History(sent, prev_tags, i)
        tag = self.tag_history(history)
        my_tagging.append(tag)
        # Append the new tag and drop the oldest one; written this way so
        # the context remains empty when n == 1.
        prev_tags = (prev_tags + (tag, ))[1:]
    return my_tagging
def test_prev_word_istitle(self):
    # PrevWord(word_istitle) must report, as a string, whether the word
    # before the current position is title-cased, and 'BOS' at position 0.
    feature = PrevWord(word_istitle)
    contexts = [
        ('<s>', '<s>'),
        ('<s>', 'D'),
        ('D', 'N'),
        ('N', 'V'),
        ('V', 'N'),
    ]
    expectations = [
        ('EL gato come pescado .'.split(),
         ['BOS', 'False', 'False', 'False', 'False']),
        ('La gata come SALMÓN .'.split(),
         ['BOS', 'True', 'False', 'False', 'False']),
    ]
    cases = [
        (History(words, contexts[position], position), expected)
        for words, values in expectations
        for position, expected in enumerate(values)
    ]
    for history, expected in cases:
        self.assertEqual(feature(history), expected)
def test_prev_word_lower(self):
    # PrevWord(word_lower) must return the lower-cased previous word, and
    # 'BOS' (beginning of sentence) at position 0.
    feature = PrevWord(word_lower)
    contexts = [
        ('<s>', '<s>'),
        ('<s>', 'D'),
        ('D', 'N'),
        ('N', 'V'),
        ('V', 'N'),
    ]
    expectations = [
        ('El gato come pescado .'.split(),
         ['BOS', 'el', 'gato', 'come', 'pescado']),
        ('La gata come salmón .'.split(),
         ['BOS', 'la', 'gata', 'come', 'salmón']),
    ]
    cases = [
        (History(words, contexts[position], position), expected)
        for words, values in expectations
        for position, expected in enumerate(values)
    ]
    for history, expected in cases:
        self.assertEqual(feature(history), expected)
def sent_histories(self, tagged_sent):
    """
    Lazily yield the histories of a tagged sentence.

    tagged_sent -- the tagged sentence (a list of pairs (word, tag)).

    Yields one History(words, context, position) per position, where
    context is the tuple of the n - 1 tags preceding that position,
    padded with '<s>' at the beginning of the sentence.
    """
    context = ('<s>', ) * (self.n - 1)
    words = [word for word, _ in tagged_sent]
    for position, (_, tag) in enumerate(tagged_sent):
        yield History(words, context, position)
        # Slide the window; this form keeps the context empty when n == 1.
        context = (context + (tag, ))[1:]
def sent_histories(self, tagged_sent):
    """
    Build the histories of a tagged sentence.

    tagged_sent -- the tagged sentence (a list of pairs (word, tag)).

    Returns a list with one History per position, each holding the list
    of words, the tuple of the n - 1 preceding tags (padded with START)
    and the position.
    """
    order = self.n
    if len(tagged_sent) != 0:
        words, tags = zip(*tagged_sent)
    else:
        # zip(*[]) yields nothing to unpack, so handle the empty case here.
        words, tags = (), ()
    words = list(words)
    padded_tags = (START, ) * (order - 1) + tags

    histories = []
    for position in range(len(words)):
        context = padded_tags[position:position + order - 1]
        histories.append(History(words, context, position))
    return histories
def tag(self, sent):
    """Tag a sentence using beam inference with beam of size 1.

    Each position is tagged by self.tag_history, given the sentence, the
    previous n - 1 tags and the position.

    sent -- the sentence.

    Returns the list of tags, one per word.
    """
    context = ('<s>',) * (self.n - 1)
    result = []
    for position, _word in enumerate(sent):
        predicted = self.tag_history(History(sent, context, position))
        result.append(predicted)
        # Slide the tag window; stays empty when n == 1.
        context = (context + (predicted,))[1:]
    return result
def tag(self, sent):
    """Tag a sentence greedily, from left to right.

    Each position is tagged by self.tag_history, given a History with the
    sentence, the previous n - 1 tags (padded with START) and the position.

    sent -- the sentence.

    Returns the list of tags, one per word.
    """
    context = (START, ) * (self.n - 1)
    result = []
    for position in range(len(sent)):
        history = History(sent=sent, prev_tags=context, i=position)
        predicted = self.tag_history(history)
        result.append(predicted)
        # Slide the tag window; stays empty when n == 1.
        context = (context + (predicted, ))[1:]
    return result
def sent_histories(self, tagged_sent):
    """
    Build the histories of a tagged sentence.

    tagged_sent -- the tagged sentence (a list of pairs (word, tag)).

    Returns a list with one History per position, each holding the list
    of words, the tuple of the n - 1 preceding tags (padded with '<s>')
    and the position.
    """
    order = self.n
    words = [word for word, _ in tagged_sent]
    # Start markers first, so border positions get a full-length context.
    padded_tags = ['<s>'] * (order - 1) + [tag for _, tag in tagged_sent]
    return [
        History(words, tuple(padded_tags[pos:pos + order - 1]), pos)
        for pos in range(len(tagged_sent))
    ]
def sent_histories(self, tagged_sent):
    """
    Build the histories of a tagged sentence.

    tagged_sent -- the tagged sentence (a list of pairs (word, tag)).

    Returns a list with one History per position, each holding the list
    of words, the tuple of the n - 1 preceding tags (padded with '<s>')
    and the position. An empty sentence yields an empty list.
    """
    order = self.n
    # Guard first: zip(*[]) would leave nothing to unpack below.
    if not tagged_sent:
        return []

    words, tags = zip(*tagged_sent)
    padded_tags = ('<s>', ) * (order - 1) + tags
    sent = list(words)

    histories = []
    for position in range(len(words)):
        context = padded_tags[position:position + order - 1]
        histories.append(History(sent, context, position))
    return histories
def test_word_isdigit(self):
    # word_isdigit must be True only for words made entirely of digits
    # ('3' and '10'); a mixed word such as 'c0m3n' must give False.
    expectations = [
        (
            'El gato come 3 pescados .'.split(),
            [
                (('<s>', '<s>'), False),
                (('<s>', 'D'), False),
                (('D', 'N'), False),
                (('N', 'V'), True),
                (('V', 'C'), False),
                (('C', 'N'), False),
            ],
        ),
        (
            'Las 10 gatas c0m3n salmón .'.split(),
            [
                (('<s>', '<s>'), False),
                (('<s>', 'D'), True),
                (('D', 'C'), False),
                (('C', 'N'), False),
                (('N', 'V'), False),
                (('V', 'N'), False),
            ],
        ),
    ]
    cases = [
        (History(words, context, position), expected)
        for words, rows in expectations
        for position, (context, expected) in enumerate(rows)
    ]
    for history, expected in cases:
        # The history is passed as the failure message to ease debugging.
        self.assertEqual(word_isdigit(history), expected, history)