def test_mitie():
    """Check MITIETokenizer tokenization and offset computation.

    Bug fix: the original comparisons were bare expressions whose boolean
    results were silently discarded, so the test could never fail. Each
    comparison is now wrapped in ``assert``.
    """
    from rasa_nlu.tokenizers.mitie_tokenizer import MITIETokenizer
    tk = MITIETokenizer()
    # Punctuation is dropped by the tokenizer ("Hi." -> "Hi").
    assert tk.tokenize(u"Hi. My name is rasa") == [
        u'Hi', u'My', u'name', u'is', u'rasa']
    # Non-ASCII (polytonic Greek) input must tokenize cleanly as well.
    assert tk.tokenize(u"ὦ ἄνδρες ᾿Αθηναῖοι.") == [
        u'ὦ', u'ἄνδρες', u'᾿Αθηναῖοι']
    # Offsets are character positions of each token's start in the input.
    assert tk.tokenize_with_offsets(u"Forecast for lunch") == (
        [u'Forecast', u'for', u'lunch'], [0, 9, 13])
def test_mitie():
    """Verify MITIETokenizer tokenization and character-offset reporting."""
    from rasa_nlu.tokenizers.mitie_tokenizer import MITIETokenizer
    tokenizer = MITIETokenizer()

    # plain-tokenization cases: (input text, expected token list)
    tokenize_cases = [
        (u"Hi. My name is rasa",
         [u'Hi', u'My', u'name', u'is', u'rasa']),
        (u"ὦ ἄνδρες ᾿Αθηναῖοι",
         [u'ὦ', u'ἄνδρες', u'᾿Αθηναῖοι']),
    ]
    for text, expected_tokens in tokenize_cases:
        assert tokenizer.tokenize(text) == expected_tokens

    # offset cases: (input text, (expected tokens, expected start offsets))
    offset_cases = [
        (u"Forecast for lunch",
         ([u'Forecast', u'for', u'lunch'], [0, 9, 13])),
        (u"hey ńöñàśçií how're you?",
         ([u'hey', u'ńöñàśçií', u'how', u'\'re', 'you', '?'],
          [0, 4, 13, 16, 20, 23])),
    ]
    for text, expected in offset_cases:
        assert tokenizer.tokenize_with_offsets(text) == expected
def find_entity(cls, ent, text):
    """Map an entity's character span onto token indices.

    Parameters
    ----------
    ent : dict
        Entity annotation with character offsets under the keys
        ``"start"`` and ``"end"``.
    text : str
        The example text the entity was annotated in.

    Returns
    -------
    (int, int)
        ``(start, end)`` token indices: the index of the first token of
        the entity and the exclusive index one past its last token.

    Raises
    ------
    ValueError
        If ``ent["start"]`` does not coincide with a token boundary, i.e.
        the entity does not span whole tokens.
    """
    tk = MITIETokenizer()
    tokens, offsets = tk.tokenize_with_offsets(text)
    if ent["start"] not in offsets:
        message = u"invalid entity {0} in example {1}:".format(ent, text) + \
                  u" entities must span whole tokens"
        raise ValueError(message)
    start = offsets.index(ent["start"])
    _slice = text[ent["start"]:ent["end"]]
    # Fix: the original called a bare ``tokenize(_slice)`` — a global not
    # defined in this block — instead of the tokenizer instance already in
    # hand. Only the token COUNT is used below, so ``tk.tokenize`` is a
    # safe, self-contained replacement.
    val_tokens = tk.tokenize(_slice)
    end = start + len(val_tokens)
    return start, end