def test_mitie():
    """Check the token texts and character offsets produced by MitieTokenizer."""
    from rasa_nlu.tokenizers.mitie_tokenizer import MitieTokenizer

    tokenizer = MitieTokenizer()

    # Each case: (sentence, expected token texts, expected token offsets).
    cases = [
        (
            "Forecast for lunch",
            ['Forecast', 'for', 'lunch'],
            [0, 9, 13],
        ),
        (
            "hey ńöñàśçií how're you?",
            ['hey', 'ńöñàśçií', 'how', '\'re', 'you', '?'],
            [0, 4, 13, 16, 20, 23],
        ),
    ]

    for sentence, expected_texts, expected_offsets in cases:
        # The sentence is tokenized once per attribute being checked.
        texts = [token.text for token in tokenizer.tokenize(sentence)]
        assert texts == expected_texts

        offsets = [token.offset for token in tokenizer.tokenize(sentence)]
        assert offsets == expected_offsets
def find_entity(ent, text):
    """Translate an entity's character span into a token index range.

    The text is tokenized with ``MitieTokenizer`` (expected to be available
    at module scope) to obtain the token offsets. The entity's ``"start"``
    must be one of those offsets.

    :param ent: mapping with ``"start"`` and ``"end"`` character positions.
    :param text: the example text the entity belongs to.
    :return: ``(start, end)`` where ``start`` is the position of
        ``ent["start"]`` in the offset list and ``end`` is ``start`` plus the
        number of tokens ``mitie.tokenize`` yields for
        ``text[ent["start"]:ent["end"]]``.
    :raises ValueError: if ``ent["start"]`` is not among the token offsets.
    """
    from mitie import tokenize

    # Only the offsets are needed here; the token strings are discarded.
    _, token_offsets = MitieTokenizer().tokenize_with_offsets(text)

    entity_start = ent["start"]
    if entity_start in token_offsets:
        first_token = token_offsets.index(entity_start)
        entity_text = text[entity_start:ent["end"]]
        # The end index is derived by counting the tokens of the entity text.
        return first_token, first_token + len(tokenize(entity_text))

    raise ValueError(
        u"Invalid entity {0} in example '{1}':".format(ent, text)
        + u" entities must span whole tokens"
    )
def test_mitie_featurizer(mitie_feature_extractor, default_config):
    """Check the first five feature values MitieFeaturizer yields for a sentence.

    The featurizer is created from the sample MITIE config file; the
    ``default_config`` argument is accepted but not used in the body.
    """
    from rasa_nlu.featurizers.mitie_featurizer import MitieFeaturizer

    sample_config = config.load("sample_configs/config_mitie.yml")
    featurizer = MitieFeaturizer.create(sample_config)

    tokens = MitieTokenizer().tokenize("Hey how are you today")
    features = featurizer.features_for_tokens(tokens, mitie_feature_extractor)

    # Reference values for the leading five components of the feature vector.
    reference = np.array(
        [0., -4.4551446, 0.26073121, -1.46632245, -1.84205751]
    )
    assert np.allclose(features[:5], reference, atol=1e-5)
def test_mitie_featurizer(mitie_feature_extractor, default_config):
    """Check the first five feature values MitieFeaturizer yields for a sentence.

    The featurizer is created from an inline component config together with a
    fresh ``RasaNLUModelConfig``; the ``default_config`` argument is accepted
    but not used in the body.
    """
    from rasa_nlu.featurizers.mitie_featurizer import MitieFeaturizer

    featurizer = MitieFeaturizer.create(
        {'name': "MitieFeaturizer"},
        RasaNLUModelConfig(),
    )

    tokens = MitieTokenizer().tokenize("Hey how are you today")
    features = featurizer.features_for_tokens(tokens, mitie_feature_extractor)

    # Reference values for the leading five components of the feature vector.
    reference = np.array(
        [0., -4.4551446, 0.26073121, -1.46632245, -1.84205751]
    )
    assert np.allclose(features[:5], reference, atol=1e-5)
def test_mitie_featurizer(mitie_feature_extractor, default_config):
    """Check the first five feature values MitieFeaturizer yields for a sentence.

    ``default_config["mitie_file"]`` is set from the ``MITIE_FILE`` environment
    variable, falling back to ``data/total_word_feature_extractor.dat`` when
    the variable is unset/empty or does not point at an existing file.
    """
    from rasa_nlu.featurizers.mitie_featurizer import MitieFeaturizer

    env_path = os.environ.get('MITIE_FILE')
    default_config["mitie_file"] = env_path
    if not (env_path and os.path.isfile(env_path)):
        # No usable path from the environment: use the bundled data location.
        default_config["mitie_file"] = os.path.join(
            "data", "total_word_feature_extractor.dat"
        )

    # NOTE(review): load() is called without arguments, so the path stored in
    # default_config above is not passed to it explicitly -- confirm intended.
    featurizer = MitieFeaturizer.load()

    tokens = MitieTokenizer().tokenize("Hey how are you today")
    features = featurizer.features_for_tokens(tokens, mitie_feature_extractor)

    reference = np.array(
        [0., -4.4551446, 0.26073121, -1.46632245, -1.84205751]
    )
    assert np.allclose(features[:5], reference, atol=1e-5)
def tokenizer_from_name(name, language):
    """Build the tokenizer whose class-level ``name`` equals *name*.

    :param name: identifier compared against ``MitieTokenizer.name``,
        ``SpacyTokenizer.name`` and ``WhitespaceTokenizer.name``, in that order.
    :param language: passed to ``spacy.load`` when the spaCy tokenizer is
        selected; unused for the other tokenizers.
    :return: a new tokenizer instance, or ``None`` when *name* matches none of
        the three known tokenizers.
    """
    from rasa_nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
    from rasa_nlu.tokenizers.mitie_tokenizer import MitieTokenizer
    from rasa_nlu.tokenizers.spacy_tokenizer import SpacyTokenizer

    if name == MitieTokenizer.name:
        return MitieTokenizer()

    if name == SpacyTokenizer.name:
        # spaCy is imported only when this tokenizer is actually requested.
        import spacy

        return SpacyTokenizer(spacy.load(language, parser=False, entity=False))

    if name == WhitespaceTokenizer.name:
        return WhitespaceTokenizer()

    # Unknown tokenizer name: no tokenizer is returned.
    return None
def test_mitie():
    """Check MitieTokenizer's plain tokenization and its offset-aware variant."""
    from rasa_nlu.tokenizers.mitie_tokenizer import MitieTokenizer

    tokenizer = MitieTokenizer()

    # tokenize() is compared directly against a list of token strings.
    plain_cases = [
        ("Hi. My name is rasa", ['Hi', 'My', 'name', 'is', 'rasa']),
        ("ὦ ἄνδρες ᾿Αθηναῖοι", ['ὦ', 'ἄνδρες', '᾿Αθηναῖοι']),
    ]
    for sentence, expected_tokens in plain_cases:
        assert tokenizer.tokenize(sentence) == expected_tokens

    # tokenize_with_offsets() is compared against a (tokens, offsets) pair.
    offset_cases = [
        (
            "Forecast for lunch",
            (['Forecast', 'for', 'lunch'], [0, 9, 13]),
        ),
        (
            "hey ńöñàśçií how're you?",
            (
                ['hey', 'ńöñàśçií', 'how', '\'re', 'you', '?'],
                [0, 4, 13, 16, 20, 23],
            ),
        ),
    ]
    for sentence, expected_pair in offset_cases:
        assert tokenizer.tokenize_with_offsets(sentence) == expected_pair
def test_mitie():
    """Check the ``text`` and ``offset`` of the tokens MitieTokenizer returns."""
    from rasa_nlu.tokenizers.mitie_tokenizer import MitieTokenizer

    tokenizer = MitieTokenizer()

    def token_texts(sentence):
        # Tokenize the sentence and collect each token's text.
        return [token.text for token in tokenizer.tokenize(sentence)]

    def token_offsets(sentence):
        # Tokenize the sentence and collect each token's offset.
        return [token.offset for token in tokenizer.tokenize(sentence)]

    plain = "Forecast for lunch"
    assert token_texts(plain) == ['Forecast', 'for', 'lunch']
    assert token_offsets(plain) == [0, 9, 13]

    accented = "hey ńöñàśçií how're you?"
    assert token_texts(accented) == [
        'hey', 'ńöñàśçií', 'how', '\'re', 'you', '?'
    ]
    assert token_offsets(accented) == [0, 4, 13, 16, 20, 23]