def test_convert_featurizer_train(
    create_or_load_convert_featurizer: Callable[[Dict[Text, Any]], ConveRTFeaturizer],
    monkeypatch: MonkeyPatch,
    load: bool,
    whitespace_tokenizer: WhitespaceTokenizer,
):
    """Training the ConveRT featurizer adds the expected dense features.

    Parameterized by ``load``: the featurizer is either freshly created or
    loaded from persisted state, and both paths must produce identical
    features for TEXT and RESPONSE while leaving INTENT unfeaturized.
    """
    # Skip the model-URL validation so the test needs no network access.
    monkeypatch.setattr(
        ConveRTFeaturizer,
        "_validate_model_url",
        lambda _: None,
    )
    component_config = {
        FEATURIZER_CLASS_ALIAS: "alias",
        "model_url": RESTRICTED_ACCESS_URL,
    }
    # BUG FIX: forward the `load` fixture parameter instead of hard-coding
    # load=True — otherwise the "create" path was never exercised.
    featurizer = create_or_load_convert_featurizer(component_config, load=load)

    sentence = "Hey how are you today ?"
    message = Message(data={TEXT: sentence})
    message.set(RESPONSE, sentence)
    td = TrainingData([message])
    whitespace_tokenizer.process_training_data(td)

    tokens = featurizer.tokenize(message, attribute=TEXT)
    message.set(TOKENS_NAMES[TEXT], tokens)
    message.set(TOKENS_NAMES[RESPONSE], tokens)

    featurizer.process_training_data(TrainingData([message]))

    expected = np.array([2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]
    )

    # TEXT and RESPONSE share the same tokens, so both carry the same features.
    for attribute in (TEXT, RESPONSE):
        seq_vecs, sent_vecs = message.get_dense_features(attribute, [])
        seq_vecs = seq_vecs.features
        sent_vecs = sent_vecs.features
        assert len(tokens) == len(seq_vecs)
        assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5)
        assert np.allclose(sent_vecs[-1][:5], expected_cls, atol=1e-5)

    # INTENT is never featurized by this component.
    seq_vecs, sent_vecs = message.get_dense_features(INTENT, [])
    assert seq_vecs is None
    assert sent_vecs is None
def test_convert_featurizer_train(monkeypatch: MonkeyPatch):
    """Training the ConveRT featurizer must set the known reference vectors."""
    tokenizer = WhitespaceTokenizer()
    # Bypass the URL check so no network access is needed.
    monkeypatch.setattr(
        ConveRTFeaturizer, "_get_validated_model_url", lambda x: RESTRICTED_ACCESS_URL
    )
    featurizer = ConveRTFeaturizer(
        {"name": "ConveRTFeaturizer", "model_url": RESTRICTED_ACCESS_URL}
    )

    text = "Hey how are you today ?"
    message = Message(data={TEXT: text})
    message.set(RESPONSE, text)
    training_data = TrainingData([message])
    tokenizer.train(training_data)

    tokens = featurizer.tokenize(message, attribute=TEXT)
    message.set(TOKENS_NAMES[TEXT], tokens)
    message.set(TOKENS_NAMES[RESPONSE], tokens)

    featurizer.train(
        TrainingData([message]), RasaNLUModelConfig(), tf_hub_module=featurizer.module
    )

    expected_seq = np.array(
        [2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456]
    )
    expected_sent = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]
    )

    # TEXT and RESPONSE were given identical tokens, so both attributes
    # must carry identical dense features.
    for attribute in (TEXT, RESPONSE):
        seq_vecs, sent_vecs = message.get_dense_features(attribute, [])
        seq_vecs = seq_vecs.features
        sent_vecs = sent_vecs.features
        assert len(tokens) == len(seq_vecs)
        assert np.allclose(seq_vecs[0][:5], expected_seq, atol=1e-5)
        assert np.allclose(sent_vecs[-1][:5], expected_sent, atol=1e-5)

    # No features are computed for the INTENT attribute.
    seq_vecs, sent_vecs = message.get_dense_features(INTENT, [])
    assert seq_vecs is None
    assert sent_vecs is None
def test_component_does_not_remove_features(tokenizer, featurizer, msg):
    """Running the featurizer twice appends features instead of replacing them."""
    message = Message({TEXT: msg})
    tokenizer.process(message)

    featurizer.process(message)
    first_seq, first_sen = message.get_dense_features(TEXT, [])
    featurizer.process(message)
    second_seq, second_sen = message.get_dense_features(TEXT, [])

    # The second pass concatenates a fresh copy of the features along axis 1,
    # so the feature dimension doubles.
    assert second_seq.features.shape[1] == first_seq.features.shape[1] * 2
    assert second_sen.features.shape[1] == first_sen.features.shape[1] * 2
def test_mitie_featurizer_train(
    create: Callable[[Dict[Text, Any]], MitieFeaturizerGraphComponent],
    mitie_model: MitieModel,
    mitie_tokenizer: MitieTokenizerGraphComponent,
):
    """Training sets MITIE dense features for TEXT/RESPONSE but not INTENT."""
    featurizer = create({"alias": "mitie_featurizer"})

    text = "Hey how are you today"
    message = Message(data={TEXT: text})
    message.set(RESPONSE, text)
    message.set(INTENT, "intent")
    mitie_tokenizer.process_training_data(TrainingData([message]))
    featurizer.process_training_data(TrainingData([message]), mitie_model)

    expected_seq = np.array(
        [0.00000000e00, -5.12735510e00, 4.39929873e-01, -5.60760403e00, -8.26445103e00]
    )
    expected_sent = np.array([0.0, -4.4551446, 0.26073121, -1.46632245, -1.84205751])

    # TEXT and RESPONSE hold the same sentence, so both get the same features.
    for attribute in (TEXT, RESPONSE):
        seq_vec, sen_vec = message.get_dense_features(attribute, [])
        seq_vec = seq_vec.features if seq_vec else seq_vec
        sen_vec = sen_vec.features if sen_vec else sen_vec
        assert len(seq_vec) == len(message.get(TOKENS_NAMES[attribute]))
        assert np.allclose(seq_vec[0][:5], expected_seq, atol=1e-5)
        assert np.allclose(sen_vec[-1][:5], expected_sent, atol=1e-5)

    # INTENT is never featurized.
    seq_vec, sen_vec = message.get_dense_features(INTENT, [])
    seq_vec = seq_vec.features if seq_vec else seq_vec
    sen_vec = sen_vec.features if sen_vec else sen_vec
    assert seq_vec is None
    assert sen_vec is None
def test_mitie_featurizer_train(mitie_feature_extractor):
    """Training sets MITIE dense features for TEXT/RESPONSE but not INTENT."""
    featurizer = MitieFeaturizer.create({}, RasaNLUModelConfig())

    text = "Hey how are you today"
    message = Message(data={TEXT: text})
    message.set(RESPONSE, text)
    message.set(INTENT, "intent")
    MitieTokenizer().train(TrainingData([message]))
    featurizer.train(
        TrainingData([message]),
        RasaNLUModelConfig(),
        mitie_feature_extractor=mitie_feature_extractor,
    )

    expected_seq = np.array(
        [0.00000000e00, -5.12735510e00, 4.39929873e-01, -5.60760403e00, -8.26445103e00]
    )
    expected_sent = np.array([0.0, -4.4551446, 0.26073121, -1.46632245, -1.84205751])

    # TEXT and RESPONSE hold the same sentence, so both get the same features.
    for attribute in (TEXT, RESPONSE):
        seq_vec, sen_vec = message.get_dense_features(attribute, [])
        seq_vec = seq_vec.features if seq_vec else seq_vec
        sen_vec = sen_vec.features if sen_vec else sen_vec
        assert len(seq_vec) == len(message.get(TOKENS_NAMES[attribute]))
        assert np.allclose(seq_vec[0][:5], expected_seq, atol=1e-5)
        assert np.allclose(sen_vec[-1][:5], expected_sent, atol=1e-5)

    # INTENT is never featurized.
    seq_vec, sen_vec = message.get_dense_features(INTENT, [])
    seq_vec = seq_vec.features if seq_vec else seq_vec
    sen_vec = sen_vec.features if sen_vec else sen_vec
    assert seq_vec is None
    assert sen_vec is None
def test_convert_featurizer_process(component_builder):
    """Processing one message with ConveRT yields the known reference vectors."""
    tokenizer = component_builder.create_component_from_class(ConveRTTokenizer)
    featurizer = component_builder.create_component_from_class(ConveRTFeaturizer)

    message = Message(data={TEXT: "Hey how are you today ?"})
    tokens = tokenizer.tokenize(message, attribute=TEXT)
    message.set(TOKENS_NAMES[TEXT], tokens)
    featurizer.process(message, tf_hub_module=tokenizer.module)

    expected_seq = np.array(
        [2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456]
    )
    expected_sent = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]
    )

    seq_vecs, sent_vecs = message.get_dense_features(TEXT, [])
    seq_vecs = seq_vecs.features
    sent_vecs = sent_vecs.features

    # One sequence feature per token; sentence vector checked on its last row.
    assert len(seq_vecs) == len(tokens)
    assert np.allclose(seq_vecs[0][:5], expected_seq, atol=1e-5)
    assert np.allclose(sent_vecs[-1][:5], expected_sent, atol=1e-5)
def print_message(message: Message) -> None:
    """Pretty-print a message's NLU payload with summarized feature matrices.

    Works on a shallow copy of the message's dict so the message itself is not
    mutated. Raw dense/sparse feature matrices are replaced with compact
    summaries (via ``dense_message``/``sparse_message``), token objects are
    reduced to their text, and internal ``"id"`` keys are stripped from intent
    entries before printing.

    Args:
        message: the processed NLU message to display.
    """
    features = {**message.as_dict_nlu()}

    seq_vecs, sen_vecs = message.get_dense_features(TEXT)
    features["dense"] = {
        "sequence": None if not seq_vecs else dense_message(seq_vecs.features),
        "sentence": None if not sen_vecs else dense_message(sen_vecs.features),
    }

    seq_vecs, sen_vecs = message.get_sparse_features(TEXT)
    features["sparse"] = {
        "sequence": None if not seq_vecs else sparse_message(seq_vecs.features),
        "sentence": None if not sen_vecs else sparse_message(sen_vecs.features),
    }

    # Idiom: `key in dict` instead of the redundant `key in dict.keys()`.
    if "text_tokens" in features:
        features["text_tokens"] = [t.text for t in features["text_tokens"]]
    if "intent" in features:
        features["intent"] = {k: v for k, v in features["intent"].items() if k != "id"}
    if "intent_ranking" in features:
        features["intent_ranking"] = [
            {k: v for k, v in i.items() if k != "id"}
            for i in features["intent_ranking"]
        ]
    if "diagnostic_data" in features:
        features["diagnostic_data"] = {
            name: {k: dense_message(v) for k, v in comp.items()}
            for name, comp in features["diagnostic_data"].items()
        }
    print(features)
def test_convert_featurizer_process(component_builder, monkeypatch: MonkeyPatch):
    """Processing with a URL-patched ConveRT tokenizer yields known vectors."""
    # Bypass URL validation so the restricted-access URL is accepted offline.
    monkeypatch.setattr(
        ConveRTTokenizer, "_get_validated_model_url", lambda x: RESTRICTED_ACCESS_URL
    )
    tokenizer = ConveRTTokenizer(
        {"name": "ConveRTTokenizer", "model_url": RESTRICTED_ACCESS_URL}
    )
    featurizer = component_builder.create_component_from_class(ConveRTFeaturizer)

    message = Message(data={TEXT: "Hey how are you today ?"})
    tokens = tokenizer.tokenize(message, attribute=TEXT)
    message.set(TOKENS_NAMES[TEXT], tokens)
    featurizer.process(message, tf_hub_module=tokenizer.module)

    expected_seq = np.array(
        [2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456]
    )
    expected_sent = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]
    )

    seq_vecs, sent_vecs = message.get_dense_features(TEXT, [])
    seq_vecs = seq_vecs.features
    sent_vecs = sent_vecs.features

    assert len(seq_vecs) == len(tokens)
    assert np.allclose(seq_vecs[0][:5], expected_seq, atol=1e-5)
    assert np.allclose(sent_vecs[-1][:5], expected_sent, atol=1e-5)
def _get_sentence_features(message: Message) -> np.ndarray:
    """Return the first sentence-level dense feature vector of the message.

    Raises:
        ValueError: if the message carries no dense sentence features.
    """
    _, sentence_features = message.get_dense_features(TEXT)
    if sentence_features is None:
        raise ValueError(
            "No sentence features present. Not able to train sklearn policy."
        )
    return sentence_features.features[0]
def test_component_no_features_on_no_tokens(tokenizer, featurizer, msg):
    """The component does not set any dense features if there are no tokens."""
    # Note: the tokenizer fixture is deliberately NOT run on this message.
    message = Message({TEXT: msg})
    featurizer.process(message)

    sequence_features, sentence_features = message.get_dense_features(TEXT, [])
    assert not sequence_features
    assert not sentence_features
def test_spacy_featurizer_sequence(sentence, expected, spacy_nlp):
    """Per-token spaCy vectors must match the doc's own token vectors."""
    from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer

    doc = spacy_nlp(sentence)
    token_vectors = [token.vector for token in doc]
    featurizer = SpacyFeaturizer.create({}, RasaNLUModelConfig())

    message = Message(
        data={TEXT: sentence, "intent": "greet", "text_features": [0.5]}
    )
    message.set(SPACY_DOCS[TEXT], doc)
    featurizer._set_spacy_features(message)

    seq_vecs, sen_vecs = message.get_dense_features(TEXT, [])
    seq_vecs = seq_vecs.features if seq_vecs else seq_vecs
    sen_vecs = sen_vecs.features if sen_vecs else sen_vecs

    first_token_vec = seq_vecs[0][:5]
    assert np.allclose(token_vectors[0][:5], first_token_vec, atol=1e-4)
    assert np.allclose(first_token_vec, expected, atol=1e-4)
    assert sen_vecs is not None
def _get_dense_features(self, message: Message) -> Optional[List]:
    """Convert dense features to python-crfsuite feature format.

    Returns None when no dense features exist or when the feature count
    does not match the token count (a warning is raised in that case).
    """
    features, _ = message.get_dense_features(
        TEXT, self.component_config["featurizers"]
    )
    if features is None:
        return None

    tokens = message.get(TOKENS_NAMES[TEXT])
    if len(tokens) != len(features.features):
        rasa.shared.utils.io.raise_warning(
            f"Number of dense features ({len(features.features)}) for attribute "
            f"'TEXT' does not match number of tokens ({len(tokens)}).",
            docs=DOCS_URL_COMPONENTS + "#crfentityextractor",
        )
        return None

    # python-crfsuite expects one dict per token mapping stringified feature
    # indices to their values.
    return [
        {
            "text_dense_features": {
                str(index): value for index, value in enumerate(token_features)
            }
        }
        for token_features in features.features
    ]
def test_spacy_featurizer_train(spacy_nlp):
    """Training sets spaCy dense features for TEXT/RESPONSE but not INTENT."""
    featurizer = create_spacy_featurizer({})

    text = "Hey how are you today"
    message = Message(data={TEXT: text})
    message.set(RESPONSE, text)
    message.set(INTENT, "intent")
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))
    message.set(SPACY_DOCS[RESPONSE], spacy_nlp(text))

    featurizer.process_training_data(TrainingData([message]))

    expected_seq = np.array([-0.28451, 0.31007, -0.57039, -0.073056, -0.17322])
    expected_sent = np.array(
        [-0.196496, 0.3249364, -0.37408298, -0.10622784, 0.062756]
    )

    # Same doc attached to TEXT and RESPONSE, so both carry the same features.
    for attribute in (TEXT, RESPONSE):
        seq_vecs, sen_vecs = message.get_dense_features(attribute, [])
        seq_vecs = seq_vecs.features if seq_vecs else seq_vecs
        sen_vecs = sen_vecs.features if sen_vecs else sen_vecs
        assert len(seq_vecs) == 5
        assert len(sen_vecs) == 1
        assert np.allclose(seq_vecs[0][:5], expected_seq, atol=1e-5)
        assert np.allclose(sen_vecs[-1][:5], expected_sent, atol=1e-5)

    # INTENT is never featurized.
    seq_vecs, sen_vecs = message.get_dense_features(INTENT, [])
    seq_vecs = seq_vecs.features if seq_vecs else seq_vecs
    sen_vecs = sen_vecs.features if sen_vecs else sen_vecs
    assert seq_vecs is None
    assert sen_vecs is None
def test_component_adds_features(tokenizer, featurizer, msg):
    """When no features exist yet, processing must add one per token."""
    message = Message({TEXT: msg})
    tokenizer.process(message)
    featurizer.process(message)

    tokens = message.get(TOKENS_NAMES[TEXT])
    sequence_features, _ = message.get_dense_features(TEXT, [])
    assert len(sequence_features.features) == len(tokens)
def _get_sentence_features(message: Message) -> scipy.sparse.spmatrix:
    """Return the sparse sentence-level features of the message.

    Warns when dense features are present, since this classifier ignores them.

    Raises:
        ValueError: if no sparse sentence features exist.
    """
    _, dense_sentence_features = message.get_dense_features(TEXT)
    if dense_sentence_features is not None:
        rasa.shared.utils.io.raise_warning(
            "Dense features are being computed but not used in "
            "the SparseNaiveBayesIntentClassifier."
        )

    _, sentence_features = message.get_sparse_features(TEXT)
    if sentence_features is None:
        raise ValueError(
            "No sparse sentence features present. "
            "Not able to train sklearn intent classifier."
        )
    return sentence_features.features
def test_crf_use_dense_features(
    crf_entity_extractor: Callable[[Dict[Text, Any]], CRFEntityExtractorGraphComponent],
    spacy_nlp: Any,
):
    """CRF extractor must pick up text_dense_features from a dense featurizer."""
    config = {
        "features": [
            ["low", "title", "upper", "pos", "pos2"],
            [
                "low",
                "suffix3",
                "suffix2",
                "upper",
                "title",
                "digit",
                "pos",
                "pos2",
                "text_dense_features",
            ],
            ["low", "title", "upper", "pos", "pos2"],
        ]
    }
    extractor = crf_entity_extractor(config)

    text = "Rasa is a company in Berlin"
    message = Message(data={TEXT: text})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))
    SpacyTokenizer().process(message)
    SpacyFeaturizer().process(message)

    crf_tokens = extractor._convert_to_crf_tokens(message)
    crf_features = extractor._crf_tokens_to_features(crf_tokens)
    assert "0:text_dense_features" in crf_features[0]

    dense_features, _ = message.get_dense_features(TEXT, [])
    if dense_features:
        dense_features = dense_features.features

    # The CRF feature dict maps stringified indices to the dense values
    # of the first token.
    token_features = crf_features[0]["0:text_dense_features"]["text_dense_features"]
    for index, value in enumerate(dense_features[0]):
        assert token_features[str(index)] == value
def _get_dense_features(self, message: Message) -> Optional[np.ndarray]:
    """Convert dense features to python-crfsuite feature format.

    Returns None when no dense features exist or when the feature count
    does not match the token count (a warning is raised in that case).
    """
    features, _ = message.get_dense_features(
        TEXT, self.component_config["featurizers"]
    )
    if features is None:
        return None

    tokens = message.get(TOKENS_NAMES[TEXT])
    # Only usable when there is exactly one feature row per token.
    if len(tokens) == len(features.features):
        return features.features

    rasa.shared.utils.io.raise_warning(
        f"Number of dense features ({len(features.features)}) for attribute "
        f"'TEXT' does not match number of tokens ({len(tokens)}).",
        docs=DOCS_URL_COMPONENTS + "#crfentityextractor",
    )
    return None
def test_get_dense_features(
    features: Optional[List[Features]],
    attribute: Text,
    featurizers: List[Text],
    expected_seq_features: Optional[List[Features]],
    expected_sen_features: Optional[List[Features]],
):
    """get_dense_features returns the expected sequence/sentence features."""
    message = Message(data={TEXT: "This is a test sentence."}, features=features)

    seq_features, sen_features = message.get_dense_features(attribute, featurizers)
    seq_features = seq_features.features if seq_features else seq_features
    sen_features = sen_features.features if sen_features else sen_features

    assert np.all(sen_features == expected_sen_features)
    assert np.all(seq_features == expected_seq_features)
def test_spacy_featurizer_cls_vector(spacy_nlp):
    """Setting spaCy features yields per-token vectors plus one sentence vector."""
    featurizer = SpacyFeaturizer.create({}, RasaNLUModelConfig())

    text = "Hey how are you today"
    message = Message(data={TEXT: text})
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))
    featurizer._set_spacy_features(message)

    seq_vecs, sen_vecs = message.get_dense_features(TEXT, [])
    seq_vecs = seq_vecs.features if seq_vecs else seq_vecs
    sen_vecs = sen_vecs.features if sen_vecs else sen_vecs

    expected_seq = np.array([-0.28451, 0.31007, -0.57039, -0.073056, -0.17322])
    expected_sent = np.array(
        [-0.196496, 0.3249364, -0.37408298, -0.10622784, 0.062756]
    )

    # Five tokens, one sentence-level ("CLS") vector.
    assert len(seq_vecs) == 5
    assert len(sen_vecs) == 1
    assert np.allclose(seq_vecs[0][:5], expected_seq, atol=1e-5)
    assert np.allclose(sen_vecs[-1][:5], expected_sent, atol=1e-5)
def test_spacy_featurizer_using_empty_model():
    """A blank spaCy model (no vectors) must produce no dense features."""
    import spacy

    text = "This test is using an empty spaCy model"
    blank_model = spacy.blank("en")

    featurizer = create_spacy_featurizer({})
    message = Message(data={TEXT: text})
    message.set(SPACY_DOCS[TEXT], blank_model(text))
    featurizer._set_spacy_features(message)

    seq_vecs, sen_vecs = message.get_dense_features(TEXT, [])
    seq_vecs = seq_vecs.features if seq_vecs else seq_vecs
    sen_vecs = sen_vecs.features if sen_vecs else sen_vecs

    assert seq_vecs is None
    assert sen_vecs is None
def test_spacy_featurizer_using_empty_model():
    """A blank spaCy model (no vectors) must produce no dense features."""
    from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer
    import spacy

    text = "This test is using an empty spaCy model"
    blank_model = spacy.blank("en")

    featurizer = SpacyFeaturizer.create({}, RasaNLUModelConfig())
    message = Message(data={TEXT: text})
    message.set(SPACY_DOCS[TEXT], blank_model(text))
    featurizer._set_spacy_features(message)

    seq_vecs, sen_vecs = message.get_dense_features(TEXT, [])
    seq_vecs = seq_vecs.features if seq_vecs else seq_vecs
    sen_vecs = sen_vecs.features if sen_vecs else sen_vecs

    assert seq_vecs is None
    assert sen_vecs is None