def test_spacy_featurizer_train(spacy_nlp):
    """Training the SpacyFeaturizer adds dense features for TEXT and RESPONSE
    but not for INTENT."""
    featurizer = SpacyFeaturizer.create({}, RasaNLUModelConfig())

    text = "Hey how are you today"
    message = Message(text)
    message.set(RESPONSE, text)
    message.set(INTENT, "intent")
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))
    message.set(SPACY_DOCS[RESPONSE], spacy_nlp(text))

    featurizer.train(TrainingData([message]), RasaNLUModelConfig())

    expected = np.array([-0.28451, 0.31007, -0.57039, -0.073056, -0.17322])
    expected_cls = np.array(
        [-0.196496, 0.3249364, -0.37408298, -0.10622784, 0.062756]
    )

    # TEXT and RESPONSE hold the same sentence, so both get identical features:
    # one vector per token plus a trailing CLS vector (5 tokens + CLS = 6).
    for attribute in [TEXT, RESPONSE]:
        vecs = message.get_dense_features(attribute, [])
        assert len(vecs) == 6
        assert np.allclose(vecs[0][:5], expected, atol=1e-5)
        assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

    # INTENT was never featurized.
    assert message.get_dense_features(INTENT, []) is None
def _get_dense_features(self, message: Message) -> Optional[List]:
    """Convert dense features to python-crfsuite feature format."""
    features = message.get_dense_features(
        TEXT, self.component_config["featurizers"]
    )
    if features is None:
        return None

    tokens = message.get(TOKENS_NAMES[TEXT])
    if len(tokens) != len(features):
        # A token/feature mismatch means we cannot align features per token;
        # warn and fall back to no dense features at all.
        common_utils.raise_warning(
            f"Number of dense features ({len(features)}) for attribute "
            f"'TEXT' does not match number of tokens ({len(tokens)}).",
            docs=DOCS_URL_COMPONENTS + "#crfentityextractor",
        )
        return None

    # convert to python-crfsuite feature format: one dict per token mapping
    # the stringified feature index to its value
    return [
        {
            "text_dense_features": {
                str(position): value
                for position, value in enumerate(token_features)
            }
        }
        for token_features in features
    ]
def process(self, message: Message, **kwargs: Any) -> None:
    """Return the most likely intent and its probability for a message."""
    if not self.clf:
        # component is either not trained or didn't
        # receive enough training data
        intent, intent_ranking = None, []
    else:
        sentence_features = train_utils.sequence_to_sentence_features(
            message.get_dense_features(TEXT)
        )
        X = sentence_features.reshape(1, -1)

        intent_ids, probabilities = self.predict(X)
        intents = self.transform_labels_num2str(np.ravel(intent_ids))
        # `predict` returns a matrix as it is supposed
        # to work for multiple examples as well, hence we need to flatten
        probabilities = probabilities.flatten()

        if intents.size == 0 or probabilities.size == 0:
            intent = {"name": None, "confidence": 0.0}
            intent_ranking = []
        else:
            ranking = list(zip(list(intents), list(probabilities)))[
                :LABEL_RANKING_LENGTH
            ]
            intent = {"name": intents[0], "confidence": probabilities[0]}
            intent_ranking = [
                {"name": name, "confidence": score} for name, score in ranking
            ]

    message.set("intent", intent, add_to_output=True)
    message.set("intent_ranking", intent_ranking, add_to_output=True)
def test_get_dense_features(
    features: Optional[List[Features]],
    attribute: Text,
    featurizers: List[Text],
    expected_features: Optional[List[Features]],
):
    """Dense features retrieved for an attribute match the expected ones."""
    message = Message("This is a test sentence.", features=features)
    retrieved = message.get_dense_features(attribute, featurizers)
    assert np.all(retrieved == expected_features)
def test_convert_featurizer_train(component_builder):
    """ConveRT training yields sequence and sentence features for TEXT and
    RESPONSE but none for INTENT."""
    tokenizer = component_builder.create_component_from_class(ConveRTTokenizer)
    featurizer = component_builder.create_component_from_class(ConveRTFeaturizer)

    text = "Hey how are you today ?"
    message = Message(text)
    message.set(RESPONSE, text)
    tokens = tokenizer.tokenize(message, attribute=TEXT)
    message.set(TOKENS_NAMES[TEXT], tokens)
    message.set(TOKENS_NAMES[RESPONSE], tokens)

    featurizer.train(
        TrainingData([message]),
        RasaNLUModelConfig(),
        tf_hub_module=tokenizer.module,
    )

    expected = np.array(
        [2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456]
    )
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]
    )

    # TEXT and RESPONSE carry the same tokens, so both attributes should
    # produce identical feature matrices.
    for attribute in [TEXT, RESPONSE]:
        seq_vecs, sent_vecs = message.get_dense_features(attribute, [])
        assert len(seq_vecs) == len(tokens)
        assert np.allclose(seq_vecs[0][:5], expected, atol=1e-5)
        assert np.allclose(sent_vecs[-1][:5], expected_cls, atol=1e-5)

    seq_vecs, sent_vecs = message.get_dense_features(INTENT, [])
    assert seq_vecs is None
    assert sent_vecs is None
def test_mitie_featurizer_train(mitie_feature_extractor):
    """MITIE training yields sequence and sentence features for TEXT and
    RESPONSE but none for INTENT."""
    featurizer = MitieFeaturizer.create({}, RasaNLUModelConfig())

    text = "Hey how are you today"
    message = Message(text)
    message.set(RESPONSE, text)
    message.set(INTENT, "intent")
    MitieTokenizer().train(TrainingData([message]))

    featurizer.train(
        TrainingData([message]),
        RasaNLUModelConfig(),
        **{"mitie_feature_extractor": mitie_feature_extractor},
    )

    expected = np.array(
        [
            0.00000000e00,
            -5.12735510e00,
            4.39929873e-01,
            -5.60760403e00,
            -8.26445103e00,
        ]
    )
    expected_cls = np.array(
        [0.0, -4.4551446, 0.26073121, -1.46632245, -1.84205751]
    )

    # TEXT and RESPONSE hold the same sentence, so both attributes should
    # produce one sequence vector per token plus the same sentence vector.
    for attribute in [TEXT, RESPONSE]:
        seq_vec, sen_vec = message.get_dense_features(attribute, [])
        assert len(seq_vec) == len(message.get(TOKENS_NAMES[attribute]))
        assert np.allclose(seq_vec[0][:5], expected, atol=1e-5)
        assert np.allclose(sen_vec[-1][:5], expected_cls, atol=1e-5)

    seq_vec, sen_vec = message.get_dense_features(INTENT, [])
    assert seq_vec is None
    assert sen_vec is None
def test_spacy_featurizer_cls_vector(spacy_nlp):
    """_set_spacy_features appends a CLS vector after the per-token vectors."""
    featurizer = SpacyFeaturizer.create({}, RasaNLUModelConfig())

    text = "Hey how are you today"
    message = Message(text)
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))

    featurizer._set_spacy_features(message)

    expected = np.array([-0.28451, 0.31007, -0.57039, -0.073056, -0.17322])
    expected_cls = np.array(
        [-0.196496, 0.3249364, -0.37408298, -0.10622784, 0.062756]
    )

    # 5 token vectors + 1 trailing CLS vector.
    vecs = message.get_dense_features(TEXT, [])
    assert len(vecs) == 6
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)
def test_spacy_featurizer_sequence(sentence, expected, spacy_nlp):
    """The first dense feature row equals the first spaCy token vector."""
    from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer

    doc = spacy_nlp(sentence)
    token_vectors = [token.vector for token in doc]

    featurizer = SpacyFeaturizer.create({}, RasaNLUModelConfig())

    message = Message(sentence, {"intent": "greet", "text_features": [0.5]})
    message.set(SPACY_DOCS[TEXT], doc)
    featurizer._set_spacy_features(message)

    first_row = message.get_dense_features(TEXT, [])[0][:5]
    assert np.allclose(token_vectors[0][:5], first_row, atol=1e-4)
    assert np.allclose(first_row, expected, atol=1e-4)
def test_spacy_featurizer_using_empty_model():
    """A blank spaCy model carries no word vectors, so no dense features are set."""
    import spacy

    from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer

    sentence = "This test is using an empty spaCy model"
    doc = spacy.blank("en")(sentence)

    featurizer = SpacyFeaturizer.create({}, RasaNLUModelConfig())

    message = Message(sentence)
    message.set(SPACY_DOCS[TEXT], doc)
    featurizer._set_spacy_features(message)

    assert message.get_dense_features(TEXT) is None
def test_crf_use_dense_features(spacy_nlp: Any):
    """The CRF extractor should mirror the featurizer's dense features.

    When `text_dense_features` is configured, each component of the first
    token's dense vector must appear in the crfsuite feature dict under its
    stringified index.
    """
    crf_extractor = CRFEntityExtractor(
        component_config={
            "features": [
                ["low", "title", "upper", "pos", "pos2"],
                [
                    "low",
                    "suffix3",
                    "suffix2",
                    "upper",
                    "title",
                    "digit",
                    "pos",
                    "pos2",
                    "text_dense_features",
                ],
                ["low", "title", "upper", "pos", "pos2"],
            ]
        }
    )
    spacy_featurizer = SpacyFeaturizer()
    spacy_tokenizer = SpacyTokenizer()

    text = "Rasa is a company in Berlin"
    message = Message(text)
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))

    spacy_tokenizer.process(message)
    spacy_featurizer.process(message)

    text_data = crf_extractor._convert_to_crf_tokens(message)
    features = crf_extractor._crf_tokens_to_features(text_data)

    assert "0:text_dense_features" in features[0]
    dense_features = message.get_dense_features(TEXT, [])
    # Use enumerate instead of the non-idiomatic `range(0, len(...))` index
    # loop: compare each dense-vector component with its crfsuite entry.
    for i, dense_value in enumerate(dense_features[0]):
        assert (
            features[0]["0:text_dense_features"]["text_dense_features"][str(i)]
            == dense_value
        )
def test_convert_featurizer_process(component_builder):
    """Processing with ConveRT adds one feature vector per token (incl. CLS)."""
    tokenizer = component_builder.create_component_from_class(ConveRTTokenizer)
    featurizer = component_builder.create_component_from_class(ConveRTFeaturizer)

    text = "Hey how are you today ?"
    message = Message(text)
    tokens = tokenizer.add_cls_token(
        tokenizer.tokenize(message, attribute=TEXT), attribute=TEXT
    )
    message.set(TOKENS_NAMES[TEXT], tokens)

    featurizer.process(message, tf_hub_module=tokenizer.module)

    expected = np.array(
        [2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456]
    )
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]
    )

    vecs = message.get_dense_features(TEXT, [])
    assert len(vecs) == len(tokens)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)
def _get_sentence_features(message: Message) -> np.ndarray:
    """Return the first sentence-level dense feature vector for TEXT.

    `get_dense_features` yields a (sequence, sentence) pair; only the
    sentence part is of interest here.
    """
    sentence_features = message.get_dense_features(TEXT)[1]
    return sentence_features[0]