def test_convert_featurizer_process(monkeypatch: MonkeyPatch):
    """Processing a tokenized message should attach the expected dense features."""
    whitespace_tokenizer = WhitespaceTokenizer()

    # Stub out URL validation so the test never downloads the real model.
    monkeypatch.setattr(
        ConveRTFeaturizer, "_get_validated_model_url", lambda x: RESTRICTED_ACCESS_URL
    )
    featurizer = ConveRTFeaturizer(
        {"name": "ConveRTFeaturizer", "model_url": RESTRICTED_ACCESS_URL}
    )

    message = Message.build(text="Hey how are you today ?")
    training_data = TrainingData([message])
    whitespace_tokenizer.train(training_data)

    tokens = featurizer.tokenize(message, attribute=TEXT)
    featurizer.process(message, tf_hub_module=featurizer.module)

    # First five dimensions of the first sequence vector and last sentence vector.
    expected_sequence = np.array(
        [2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456]
    )
    expected_sentence = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]
    )

    sequence_features, sentence_features = message.get_dense_features(TEXT, [])
    sequence_features = sequence_features.features
    sentence_features = sentence_features.features

    assert len(tokens) == len(sequence_features)
    assert np.allclose(sequence_features[0][:5], expected_sequence, atol=1e-5)
    assert np.allclose(sentence_features[-1][:5], expected_sentence, atol=1e-5)
def test_convert_featurizer_train():
    """Training should add identical dense features for TEXT and RESPONSE, none for INTENT."""
    featurizer = ConveRTFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today ?"
    message = Message(sentence)
    message.set(RESPONSE, sentence)

    tokens = ConveRTTokenizer().tokenize(message, attribute=TEXT)
    tokens = Tokenizer.add_cls_token(tokens, attribute=TEXT)
    message.set(TOKENS_NAMES[TEXT], tokens)
    message.set(TOKENS_NAMES[RESPONSE], tokens)

    featurizer.train(TrainingData([message]), RasaNLUModelConfig())

    expected = np.array(
        [2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456]
    )
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]
    )

    # TEXT and RESPONSE share the same tokens, so their features must match.
    for attribute in (TEXT, RESPONSE):
        vecs = message.get(DENSE_FEATURE_NAMES[attribute])
        assert len(tokens) == len(vecs)
        assert np.allclose(vecs[0][:5], expected, atol=1e-5)
        assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

    # INTENT was never tokenized, so no features should exist for it.
    assert message.get(DENSE_FEATURE_NAMES[INTENT]) is None
def test_convert_featurizer_process():
    """Processing should yield six 1024-dim dense vectors for the tokenized TEXT."""
    featurizer = ConveRTFeaturizer.create({}, RasaNLUModelConfig())

    message = Message("Hey how are you today ?")
    show_message(message)

    tokens = ConveRTTokenizer().tokenize(message, attribute=TEXT)
    tokens = Tokenizer.add_cls_token(tokens, attribute=TEXT)
    message.set(TOKENS_NAMES[TEXT], tokens)

    # Sanity-check the tokenized message before featurization.
    assert show_message(message, False) == {
        "tokens": ["hey", "how", "are", "you", "today", "__CLS__"],
        "text": "Hey how are you today ?",
    }

    featurizer.process(message)
    show_message(message)

    expected = np.array(
        [2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456]
    )
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]
    )

    vecs = message.get(DENSE_FEATURE_NAMES[TEXT])
    # One vector per token (5 words + __CLS__), each 1024-dimensional.
    assert len(tokens) == len(vecs) == 6
    assert len(vecs[0]) == 1024
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)
def test_convert_featurizer_tokens_to_text(component_builder, sentence, expected_text):
    """_tokens_to_text should reconstruct the expected text from tokenizer output."""
    tokenizer = component_builder.create_component_from_class(ConveRTTokenizer)
    tokens = tokenizer.tokenize(Message(sentence), attribute=TEXT)
    assert ConveRTFeaturizer._tokens_to_text([tokens])[0] == expected_text
def test_raise_invalid_urls(model_url: Optional[Text], exception_phrase: Text):
    """Constructing the featurizer with a bad model_url must raise RasaException."""
    config = {"name": "ConveRTFeaturizer", "model_url": model_url}
    with pytest.raises(RasaException) as excinfo:
        ConveRTFeaturizer(config)
    assert exception_phrase in str(excinfo.value)
def test_convert_featurizer_tokens_to_text(
    create_or_load_convert_featurizer: Callable[[Dict[Text, Any]], ConveRTFeaturizer],
    sentence: Text,
    expected_text: Text,
    monkeypatch: MonkeyPatch,
    whitespace_tokenizer: WhitespaceTokenizer,
):
    """_tokens_to_text must round-trip text produced by the whitespace tokenizer."""
    # Stub out model-URL validation so no network access is required.
    monkeypatch.setattr(ConveRTFeaturizer, "_validate_model_url", lambda _: None)

    featurizer = create_or_load_convert_featurizer(
        {FEATURIZER_CLASS_ALIAS: "alias", "model_url": RESTRICTED_ACCESS_URL}
    )

    message = Message.build(text=sentence)
    whitespace_tokenizer.process_training_data(TrainingData([message]))

    tokens = featurizer.tokenize(message, attribute=TEXT)
    assert ConveRTFeaturizer._tokens_to_text([tokens])[0] == expected_text
def test_convert_featurizer_tokens_to_text(sentence, expected_text):
    """Tokenizing then re-joining via _tokens_to_text should recover the text."""
    tokens = ConveRTTokenizer().tokenize(Message(sentence), attribute=TEXT_ATTRIBUTE)
    assert ConveRTFeaturizer._tokens_to_text([tokens])[0] == expected_text
def test_convert_featurizer_tokens_to_text(
    sentence: Text, expected_text: Text, monkeypatch: MonkeyPatch
):
    """_tokens_to_text should reconstruct the whitespace-tokenized sentence."""
    whitespace_tokenizer = WhitespaceTokenizer()

    # Avoid downloading the real model during featurizer construction.
    monkeypatch.setattr(
        ConveRTFeaturizer, "_get_validated_model_url", lambda x: RESTRICTED_ACCESS_URL
    )
    featurizer = ConveRTFeaturizer(
        {"name": "ConveRTFeaturizer", "model_url": RESTRICTED_ACCESS_URL}
    )

    message = Message.build(text=sentence)
    whitespace_tokenizer.train(TrainingData([message]))

    tokens = featurizer.tokenize(message, attribute=TEXT)
    assert ConveRTFeaturizer._tokens_to_text([tokens])[0] == expected_text
def test_raise_wrong_model_directory(tmp_path: Path):
    """A directory lacking the expected model files must be rejected."""
    config = {"name": "ConveRTFeaturizer", "model_url": str(tmp_path)}
    with pytest.raises(RasaException) as excinfo:
        ConveRTFeaturizer(config)
    assert "Re-check the files inside the directory" in str(excinfo.value)
def test_raise_invalid_path():
    """A model_url that is neither a URL nor a local directory must be rejected."""
    config = {"name": "ConveRTFeaturizer", "model_url": "saved_model.pb"}
    with pytest.raises(RasaException) as excinfo:
        ConveRTFeaturizer(config)
    assert "neither a valid remote URL nor a local directory" in str(excinfo.value)
def test_raise_wrong_model_file(tmp_path: Path):
    """A model_url pointing at an invalid (empty) model file must be rejected.

    The dummy file is created with ``Path.touch()`` instead of the original
    bare ``open(...)`` / ``close()`` pair, which neither used a context
    manager nor needed a file handle at all; it also avoids mixing
    ``os.path.join`` with the ``pathlib.Path`` fixture.
    """
    # Create an empty dummy "saved_model.pb" — present but not a valid model.
    temp_file = tmp_path / "saved_model.pb"
    temp_file.touch()

    component_config = {"name": "ConveRTFeaturizer", "model_url": str(temp_file)}
    with pytest.raises(RasaException) as excinfo:
        _ = ConveRTFeaturizer(component_config)
    assert "set to the path of a file which is invalid" in str(excinfo.value)
def test_convert_featurizer_number_of_sub_tokens(
    text: Text, expected_number_of_sub_tokens: List[int], monkeypatch: MonkeyPatch
):
    """Each token should record how many sub-tokens ConveRT split it into."""
    whitespace_tokenizer = WhitespaceTokenizer()

    # Skip model-URL validation so no model download happens.
    monkeypatch.setattr(
        ConveRTFeaturizer, "_get_validated_model_url", lambda x: RESTRICTED_ACCESS_URL
    )
    featurizer = ConveRTFeaturizer(
        {"name": "ConveRTFeaturizer", "model_url": RESTRICTED_ACCESS_URL}
    )

    message = Message.build(text=text)
    whitespace_tokenizer.train(TrainingData([message]))

    tokens = featurizer.tokenize(message, attribute=TEXT)
    actual_sub_token_counts = [token.get(NUMBER_OF_SUB_TOKENS) for token in tokens]
    assert actual_sub_token_counts == expected_number_of_sub_tokens
def test_convert_featurizer_tokens_to_text(
    sentence: Text, expected_text: Text, monkeypatch: MonkeyPatch
):
    """ConveRTTokenizer output must round-trip through _tokens_to_text."""
    # Stub out URL validation on the tokenizer to avoid a model download.
    monkeypatch.setattr(
        ConveRTTokenizer, "_get_validated_model_url", lambda x: RESTRICTED_ACCESS_URL
    )
    tokenizer = ConveRTTokenizer(
        {"name": "ConveRTTokenizer", "model_url": RESTRICTED_ACCESS_URL}
    )

    tokens = tokenizer.tokenize(Message(data={TEXT: sentence}), attribute=TEXT)
    assert ConveRTFeaturizer._tokens_to_text([tokens])[0] == expected_text
def test_convert_featurizer_token_edge_cases(
    text: Text,
    expected_tokens: List[Text],
    expected_indices: List[Tuple[int]],
    monkeypatch: MonkeyPatch,
):
    """Edge-case inputs should tokenize into the expected texts and spans."""
    whitespace_tokenizer = WhitespaceTokenizer()

    # Skip model-URL validation so no model download happens.
    monkeypatch.setattr(
        ConveRTFeaturizer, "_get_validated_model_url", lambda x: RESTRICTED_ACCESS_URL
    )
    featurizer = ConveRTFeaturizer(
        {"name": "ConveRTFeaturizer", "model_url": RESTRICTED_ACCESS_URL}
    )

    message = Message.build(text=text)
    whitespace_tokenizer.train(TrainingData([message]))

    tokens = featurizer.tokenize(message, attribute=TEXT)
    assert [token.text for token in tokens] == expected_tokens
    assert [token.start for token in tokens] == [pair[0] for pair in expected_indices]
    assert [token.end for token in tokens] == [pair[1] for pair in expected_indices]
from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
    CountVectorsFeaturizer,
)
from rasa.nlu.constants import (
    SPACY_DOCS,
    TEXT,
    DENSE_FEATURE_NAMES,
    TOKENS_NAMES,
    SPARSE_FEATURE_NAMES,
)

# Script fragment: log the dense-feature shapes produced by several featurizers.
# NOTE(review): `logging_setup`, `spacy_nlp` and `mitie_feature_extractor` are
# presumably fixtures/helpers defined elsewhere — confirm against the caller.
logger = logging_setup()

test_input = "Okay, pick up this yellow banana for me."

# SpaCy dense features.
featurizer = SpacyFeaturizer.create({}, RasaNLUModelConfig())
message = Message(test_input)
message.set(SPACY_DOCS[TEXT], spacy_nlp(test_input))
featurizer._set_spacy_features(message)
vecs = message.get(DENSE_FEATURE_NAMES[TEXT])
logger.info("SpaCy: {}".format(vecs.shape))

# Mitie dense features.
message = Message(test_input)
featurizer = MitieFeaturizer.create({}, RasaNLUModelConfig())
MitieTokenizer().process(message)
tokens = message.get(TOKENS_NAMES[TEXT])
vecs = featurizer.features_for_tokens(tokens, mitie_feature_extractor)
logger.info("Mitie: {}".format(vecs.shape))

# ConveRT dense features.
featurizer = ConveRTFeaturizer.create({}, RasaNLUModelConfig())
tokens = ConveRTTokenizer().tokenize(message, attribute=TEXT)
tokens = Tokenizer.add_cls_token(tokens, attribute=TEXT)
message.set(TOKENS_NAMES[TEXT], tokens)
featurizer.process(message)
vecs = message.get(DENSE_FEATURE_NAMES[TEXT])
logger.info("ConveRT: {}".format(vecs.shape))