def test_count_vector_featurizer_char(
    sentence: Text,
    expected: List[List[int]],
    create_featurizer: Callable[..., CountVectorsFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    ftr = create_featurizer({"min_ngram": 1, "max_ngram": 2, "analyzer": "char"})

    train_message = Message(data={TEXT: sentence})
    whitespace_tokenizer.process([train_message])

    data = TrainingData([train_message])
    ftr.train(data)
    ftr.process_training_data(data)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert np.all(seq_vec.toarray()[0] == expected)
    assert sen_vec is not None
def test_count_vector_featurizer(
    sentence: Text,
    expected: List[List[int]],
    expected_cls: List[List[int]],
    create_featurizer: Callable[..., CountVectorsFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    ftr = create_featurizer()

    train_message = Message(data={TEXT: sentence})
    test_message = Message(data={TEXT: sentence})

    whitespace_tokenizer.process([train_message])
    whitespace_tokenizer.process([test_message])

    ftr.train(TrainingData([train_message]))
    ftr.process([test_message])

    seq_vecs, sen_vecs = test_message.get_sparse_features(TEXT, [])
    if seq_vecs:
        seq_vecs = seq_vecs.features
    if sen_vecs:
        sen_vecs = sen_vecs.features

    assert isinstance(seq_vecs, scipy.sparse.coo_matrix)
    assert isinstance(sen_vecs, scipy.sparse.coo_matrix)

    actual_seq_vecs = seq_vecs.toarray()
    actual_sen_vecs = sen_vecs.toarray()

    assert np.all(actual_seq_vecs[0] == expected)
    assert np.all(actual_sen_vecs[-1] == expected_cls)
def test_message_fingerprint_includes_data_and_features(
    whitespace_tokenizer: WhitespaceTokenizer,
):
    message = Message(data={TEXT: "This is a test sentence."})
    fp1 = message.fingerprint()

    whitespace_tokenizer.process([message])
    fp2 = message.fingerprint()
    assert fp1 != fp2

    message.add_features(
        Features(scipy.sparse.csr_matrix([1, 1, 0]), FEATURE_TYPE_SEQUENCE, TEXT, "c2")
    )
    fp3 = message.fingerprint()
    assert fp2 != fp3

    message.add_features(
        Features(np.ndarray([1, 2, 2]), FEATURE_TYPE_SEQUENCE, TEXT, "c1")
    )
    fp4 = message.fingerprint()
    assert fp3 != fp4

    assert len({fp1, fp2, fp3, fp4}) == 4
def test_crf_use_dense_features(ner_crf_pos_feature_config, spacy_nlp):
    from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor
    from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
    from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer

    ner_crf_pos_feature_config["features"][1].append("text_dense_features")
    crf_extractor = CRFEntityExtractor(component_config=ner_crf_pos_feature_config)
    spacy_featurizer = SpacyFeaturizer()
    white_space_tokenizer = WhitespaceTokenizer({"use_cls_token": False})

    text = "Rasa is a company in Berlin"
    message = Message(text)
    message.set("spacy_doc", spacy_nlp(text))

    white_space_tokenizer.process(message)
    spacy_featurizer.process(message)

    text_data = crf_extractor._from_text_to_crf(message)
    features = crf_extractor._sentence_to_features(text_data)

    assert "0:text_dense_features" in features[0]
    for i in range(0, len(message.data.get("text_dense_features")[0])):
        assert (
            features[0]["0:text_dense_features"]["text_dense_features"][str(i)]
            == message.data.get("text_dense_features")[0][i]
        )
async def test_train_persist_load_with_composite_entities(
    crf_entity_extractor: Callable[[Dict[Text, Any]], CRFEntityExtractor],
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    whitespace_tokenizer: WhitespaceTokenizer,
):
    importer = RasaFileImporter(
        training_data_paths=["data/test/demo-rasa-composite-entities.yml"]
    )
    training_data = importer.get_nlu_data()

    whitespace_tokenizer.process_training_data(training_data)

    crf_extractor = crf_entity_extractor({})
    crf_extractor.train(training_data)

    message = Message(data={TEXT: "I am looking for an italian restaurant"})
    whitespace_tokenizer.process([message])
    message2 = copy.deepcopy(message)

    processed_message = crf_extractor.process([message])[0]

    loaded_extractor = CRFEntityExtractor.load(
        CRFEntityExtractor.get_default_config(),
        default_model_storage,
        Resource("CRFEntityExtractor"),
        default_execution_context,
    )

    processed_message2 = loaded_extractor.process([message2])[0]

    assert processed_message2.fingerprint() == processed_message.fingerprint()
    assert list(loaded_extractor.entity_taggers.keys()) == list(
        crf_extractor.entity_taggers.keys()
    )
def test_features_are_sparse(
    whitespace_tokenizer: WhitespaceTokenizer,
    semantic_map_featurizer: SemanticMapFeaturizer,
):
    message = Message.build("word1 word3")

    whitespace_tokenizer.process(message)
    semantic_map_featurizer.process(message)

    for feature in message.features:
        assert scipy.sparse.issparse(feature.features)
def test_process_tokenizer_action_name(text: Text, expected_tokens: List[Text]):
    tk = WhitespaceTokenizer({"intent_tokenization_flag": True})

    message = Message.build(text=text)
    message.set(ACTION_NAME, text)

    tk.process(message)

    tokens = message.get(TOKENS_NAMES[ACTION_NAME])
    assert [t.text for t in tokens] == expected_tokens
def test_process_tokenizer(text, expected_tokens, expected_indices):
    tk = WhitespaceTokenizer()

    message = Message(text)
    tk.process(message)

    tokens = message.get(TOKENS_NAMES[TEXT_ATTRIBUTE])

    assert [t.text for t in tokens] == expected_tokens
    assert [t.start for t in tokens] == [i[0] for i in expected_indices]
    assert [t.end for t in tokens] == [i[1] for i in expected_indices]
class LaBSEScorer:
    def __init__(self):
        self.featurizer = LanguageModelFeaturizer(
            {"model_name": "bert", "model_weights": "rasa/LaBSE"}
        )
        self.tokenizer = WhitespaceTokenizer()

    @staticmethod
    def compute_similarity_score(
        feature_vec_a: np.ndarray, feature_vec_b: np.ndarray
    ):
        return 1 - cosine(feature_vec_a, feature_vec_b)

    def compute_similarity_for_pair(self, a: Message, b: Message):
        features_a = a.features[0].features
        features_b = b.features[0].features
        return self.compute_similarity_score(features_a, features_b)

    def compute_features(self, example: Message):
        self.tokenizer.process(example)
        self.featurizer.process(example)

    def compute_similarity_with_paraphrases(self, example: Message):
        # Set features for the text of the example itself first.
        self.featurizer.process(example)

        paraphrases = example.get("metadata").get("paraphrases")
        similarity_scores = []

        # Construct an individual message for each paraphrase.
        for paraphrase in paraphrases:
            message = Message.build(text=paraphrase)
            self.compute_features(message)
            similarity = self.compute_similarity_for_pair(example, message)
            similarity_scores.append(similarity)

        return similarity_scores

    def compute_similarities(self, examples: List[Message]) -> List[List[float]]:
        scores_for_collection = []
        for example in examples:
            similarity_scores = self.compute_similarity_with_paraphrases(example)
            scores_for_collection.append(similarity_scores)
        return scores_for_collection
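# Hypothetical usage sketch for LaBSEScorer (not part of the original code):
# it assumes the single-message ``process()`` API used by the class above and
# that each example Message carries its paraphrases under a "metadata" key,
# which is what ``compute_similarity_with_paraphrases`` reads. The texts and
# the helper name are made-up illustrations.
def score_example_against_paraphrases() -> List[List[float]]:
    scorer = LaBSEScorer()

    example = Message(
        data={
            TEXT: "book a table for two tonight",
            "metadata": {
                "paraphrases": [
                    "reserve a table for two this evening",
                    "I need a dinner reservation for two",
                ]
            },
        }
    )

    # The scorer featurizes the example itself inside
    # compute_similarity_with_paraphrases, but it expects tokens to already be
    # present, so tokenize the example first.
    scorer.tokenizer.process(example)

    return scorer.compute_similarities([example])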
def test_process_tokenizer(
    text: Text, expected_tokens: List[Text], expected_indices: List[Tuple[int]]
):
    tk = WhitespaceTokenizer()

    message = Message.build(text=text)
    tk.process(message)

    tokens = message.get(TOKENS_NAMES[TEXT])

    assert [t.text for t in tokens] == expected_tokens
    assert [t.start for t in tokens] == [i[0] for i in expected_indices]
    assert [t.end for t in tokens] == [i[1] for i in expected_indices]
def process_texts(
    texts: List[Text], model_name: Text, model_weights: Text
) -> List[Message]:
    config = create_pretrained_transformers_config(model_name, model_weights)
    whitespace_tokenizer = WhitespaceTokenizer()
    transformer = HFTransformersNLP(config)

    messages = []
    for text in texts:
        message = Message.build(text=text)
        whitespace_tokenizer.process(message)
        transformer.process(message)
        messages.append(message)
    return messages
def test_feature_shapes(
    whitespace_tokenizer: WhitespaceTokenizer,
    semantic_map_featurizer: SemanticMapFeaturizer,
):
    message = Message.build("word1 word3")

    whitespace_tokenizer.process(message)
    semantic_map_featurizer.process(message)

    for feature in message.features:
        assert (
            feature.type == FEATURE_TYPE_SEQUENCE
            and feature.features.shape == (2, 37)
        ) or (
            feature.type == FEATURE_TYPE_SENTENCE
            and feature.features.shape == (1, 37)
        )
def process_messages(
    texts: List[Text],
    model_name: Text,
    model_weights: Text,
    create_language_model_featurizer: Callable[
        [Dict[Text, Any]], LanguageModelFeaturizer
    ],
    whitespace_tokenizer: WhitespaceTokenizer,
) -> List[Message]:
    """Creates a featurizer and processes messages."""
    config = create_pretrained_transformers_config(model_name, model_weights)
    lm_featurizer = create_language_model_featurizer(config)

    messages = []
    for text in texts:
        message = Message.build(text=text)
        whitespace_tokenizer.process([message])
        messages.append(message)
    lm_featurizer.process(messages)
    return messages
def test_count_vector_featurizer_process_by_attribute(
    sentence: Text,
    action_name: Text,
    action_text: Text,
    action_name_features: np.ndarray,
    response_features: np.ndarray,
    create_featurizer: Callable[..., CountVectorsFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    ftr = create_featurizer({"token_pattern": r"(?u)\b\w+\b"})

    # Add a second example that has some response, so that the vocabulary for
    # response exists.
    train_message = Message(data={TEXT: "hello"})
    train_message.set(ACTION_NAME, "greet")

    train_message1 = Message(data={TEXT: "hello"})
    train_message1.set(ACTION_TEXT, "hi")

    data = TrainingData([train_message, train_message1])
    whitespace_tokenizer.process_training_data(data)
    ftr.train(data)

    test_message = Message(data={TEXT: sentence})
    test_message.set(ACTION_NAME, action_name)
    test_message.set(ACTION_TEXT, action_text)

    whitespace_tokenizer.process([test_message])
    ftr.process([test_message])

    action_name_seq_vecs, action_name_sen_vecs = test_message.get_sparse_features(
        ACTION_NAME, []
    )
    if action_name_seq_vecs:
        action_name_seq_vecs = action_name_seq_vecs.features
    if action_name_sen_vecs:
        action_name_sen_vecs = action_name_sen_vecs.features

    assert action_name_seq_vecs.toarray()[0] == action_name_features
    assert action_name_sen_vecs is None
def test_count_vector_featurizer_oov_words(
    sentence: Text,
    expected: List[List[int]],
    create_featurizer: Callable[..., CountVectorsFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    ftr = create_featurizer(
        {"OOV_token": "__oov__", "OOV_words": ["oov_word0", "OOV_word1"]}
    )
    train_message = Message(data={TEXT: sentence})
    whitespace_tokenizer.process([train_message])

    data = TrainingData([train_message])
    ftr.train(data)
    ftr.process_training_data(data)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert np.all(seq_vec.toarray()[0] == expected)
    assert sen_vec is not None
def test_whitespace_processing_with_attribute(
    supervised_embeddings_config: RasaNLUModelConfig,
):
    message = Message(
        data={
            TEXT: "Any Mexican restaurant will do",
            "intent": "restaurant_search",
            "entities": [
                {"start": 4, "end": 11, "value": "Mexican", "entity": "cuisine"}
            ],
        }
    )
    expected_tokens_intent = ["restaurant_search"]
    expected_tokens_text = ["Any", "Mexican", "restaurant", "will", "do"]

    component_config = {"case_sensitive": False}
    tk = WhitespaceTokenizer(component_config)

    tk.process(message)
    tokens_intent = message.get(TOKENS_NAMES[INTENT])
    tokens_text = message.get(TOKENS_NAMES[TEXT])

    assert [t.text for t in tokens_intent] == expected_tokens_intent
    assert [t.text for t in tokens_text] == expected_tokens_text

    message = Message(
        data={
            TEXT: "Where are you going?",
            ACTION_NAME: "Where are you going?",
            ACTION_TEXT: "Where are you going?",
        }
    )
    expected_action_tokens_text = ["Where", "are", "you", "going"]

    component_config = {"case_sensitive": False}
    tk = WhitespaceTokenizer(component_config)

    tk.process(message)
    tokens_action_text = message.get(TOKENS_NAMES[ACTION_TEXT])
    tokens_text = message.get(TOKENS_NAMES[TEXT])

    assert [t.text for t in tokens_action_text] == expected_action_tokens_text
    assert [t.text for t in tokens_text] == expected_action_tokens_text
def test_count_vector_featurizer_persist_load(
    create_featurizer: Callable[..., CountVectorsFeaturizer],
    load_featurizer: Callable[..., CountVectorsFeaturizer],
    whitespace_tokenizer: WhitespaceTokenizer,
):
    # Set non-default values in the config.
    config = {
        "analyzer": "char",
        "strip_accents": "ascii",
        "stop_words": "stop",
        "min_df": 2,
        "max_df": 3,
        "min_ngram": 2,
        "max_ngram": 3,
        "max_features": 10,
        "lowercase": False,
    }
    train_ftr = create_featurizer(config)

    sentence1 = "ababab 123 13xc лаомтгцу sfjv oö aà"
    sentence2 = "abababalidcn 123123 13xcdc лаомтгцу sfjv oö aà"
    train_message1 = Message(data={TEXT: sentence1})
    train_message2 = Message(data={TEXT: sentence2})
    whitespace_tokenizer.process([train_message1])
    whitespace_tokenizer.process([train_message2])

    data = TrainingData([train_message1, train_message2])
    train_ftr.train(data)
    train_ftr.process_training_data(data)

    # Persist the featurizer: collect the trained vectorizer parameters.
    train_vect_params = {
        attribute: vectorizer.get_params()
        for attribute, vectorizer in train_ftr.vectorizers.items()
    }

    # Add the trained vocabulary to the vectorizer params.
    for attribute, attribute_vect_params in train_vect_params.items():
        if hasattr(train_ftr.vectorizers[attribute], "vocabulary_"):
            train_vect_params[attribute].update(
                {"vocabulary": train_ftr.vectorizers[attribute].vocabulary_}
            )

    test_ftr = load_featurizer(config)
    test_vect_params = {
        attribute: vectorizer.get_params()
        for attribute, vectorizer in test_ftr.vectorizers.items()
    }

    assert train_vect_params == test_vect_params

    # Check that the vocabulary was loaded correctly.
    assert hasattr(test_ftr.vectorizers[TEXT], "vocabulary_")

    test_message1 = Message(data={TEXT: sentence1})
    whitespace_tokenizer.process([test_message1])
    test_ftr.process([test_message1])
    test_message2 = Message(data={TEXT: sentence2})
    whitespace_tokenizer.process([test_message2])
    test_ftr.process([test_message2])

    test_seq_vec_1, test_sen_vec_1 = test_message1.get_sparse_features(TEXT, [])
    if test_seq_vec_1:
        test_seq_vec_1 = test_seq_vec_1.features
    if test_sen_vec_1:
        test_sen_vec_1 = test_sen_vec_1.features

    train_seq_vec_1, train_sen_vec_1 = train_message1.get_sparse_features(TEXT, [])
    if train_seq_vec_1:
        train_seq_vec_1 = train_seq_vec_1.features
    if train_sen_vec_1:
        train_sen_vec_1 = train_sen_vec_1.features

    test_seq_vec_2, test_sen_vec_2 = test_message2.get_sparse_features(TEXT, [])
    if test_seq_vec_2:
        test_seq_vec_2 = test_seq_vec_2.features
    if test_sen_vec_2:
        test_sen_vec_2 = test_sen_vec_2.features

    train_seq_vec_2, train_sen_vec_2 = train_message2.get_sparse_features(TEXT, [])
    if train_seq_vec_2:
        train_seq_vec_2 = train_seq_vec_2.features
    if train_sen_vec_2:
        train_sen_vec_2 = train_sen_vec_2.features

    # Check that train features and test features after loading are the same.
    assert np.all(test_seq_vec_1.toarray() == train_seq_vec_1.toarray())
    assert np.all(test_sen_vec_1.toarray() == train_sen_vec_1.toarray())
    assert np.all(test_seq_vec_2.toarray() == train_seq_vec_2.toarray())
    assert np.all(test_sen_vec_2.toarray() == train_sen_vec_2.toarray())
def test_whitespace_with_case():
    from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer

    component_config = {"case_sensitive": False}
    tk = WhitespaceTokenizer(component_config)
    assert [t.text for t in tk.tokenize("Forecast for LUNCH")] == [
        "forecast",
        "for",
        "lunch",
    ]

    component_config = {"case_sensitive": True}
    tk = WhitespaceTokenizer(component_config)
    assert [t.text for t in tk.tokenize("Forecast for LUNCH")] == [
        "Forecast",
        "for",
        "LUNCH",
    ]

    component_config = {}
    tk = WhitespaceTokenizer(component_config)
    assert [t.text for t in tk.tokenize("Forecast for LUNCH")] == [
        "Forecast",
        "for",
        "LUNCH",
    ]

    component_config = {"case_sensitive": False}
    tk = WhitespaceTokenizer(component_config)
    message = Message("Forecast for LUNCH")
    tk.process(message)
    assert message.data.get("tokens")[0].text == "forecast"
    assert message.data.get("tokens")[1].text == "for"
    assert message.data.get("tokens")[2].text == "lunch"

    _config = utilities.base_test_conf("supervised_embeddings")
    examples = [
        Message(
            "Any Mexican restaurant will do",
            {
                "intent": "restaurant_search",
                "entities": [
                    {"start": 4, "end": 11, "value": "Mexican", "entity": "cuisine"}
                ],
            },
        ),
        Message(
            "I want Tacos!",
            {
                "intent": "restaurant_search",
                "entities": [
                    {"start": 7, "end": 12, "value": "Mexican", "entity": "cuisine"}
                ],
            },
        ),
    ]

    component_config = {"case_sensitive": False}
    tk = WhitespaceTokenizer(component_config)
    tk.train(TrainingData(training_examples=examples), _config)
    assert examples[0].data.get("tokens")[0].text == "any"
    assert examples[0].data.get("tokens")[1].text == "mexican"
    assert examples[0].data.get("tokens")[2].text == "restaurant"
    assert examples[0].data.get("tokens")[3].text == "will"
    assert examples[0].data.get("tokens")[4].text == "do"
    assert examples[1].data.get("tokens")[0].text == "i"
    assert examples[1].data.get("tokens")[1].text == "want"
    assert examples[1].data.get("tokens")[2].text == "tacos"