def test_whitespace_training(supervised_embeddings_config):
    examples = [
        Message(
            "Any Mexican restaurant will do",
            {
                "intent": "restaurant_search",
                "entities": [
                    {"start": 4, "end": 11, "value": "Mexican", "entity": "cuisine"}
                ],
            },
        ),
        Message(
            "I want Tacos!",
            {
                "intent": "restaurant_search",
                "entities": [
                    {"start": 7, "end": 12, "value": "Mexican", "entity": "cuisine"}
                ],
            },
        ),
    ]

    tk = WhitespaceTokenizer()

    tk.train(TrainingData(training_examples=examples), supervised_embeddings_config)

    assert examples[0].data.get(TOKENS_NAMES[TEXT])[0].text == "Any"
    assert examples[0].data.get(TOKENS_NAMES[TEXT])[1].text == "Mexican"
    assert examples[0].data.get(TOKENS_NAMES[TEXT])[2].text == "restaurant"
    assert examples[0].data.get(TOKENS_NAMES[TEXT])[3].text == "will"
    assert examples[0].data.get(TOKENS_NAMES[TEXT])[4].text == "do"
    assert examples[1].data.get(TOKENS_NAMES[TEXT])[0].text == "I"
    assert examples[1].data.get(TOKENS_NAMES[TEXT])[1].text == "want"
    assert examples[1].data.get(TOKENS_NAMES[TEXT])[2].text == "Tacos"
def test_count_vector_featurizer_using_tokens(tokens, expected):
    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})

    # Use an empty string instead of real text to make sure the count
    # vector can only come from the `tokens` feature; relying on
    # `message.text` would not produce the correct result.
    tokens_feature = [Token(i, 0) for i in tokens]

    train_message = Message("")
    train_message.set("tokens", tokens_feature)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])

    ftr.train(data)

    test_message = Message("")
    test_message.set("tokens", tokens_feature)

    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)
def test_process(
    text: Text,
    lookup: List[Dict[Text, List[Text]]],
    expected_entities: List[Dict[Text, Any]],
):
    message = Message(text)

    training_data = TrainingData()
    training_data.lookup_tables = lookup
    training_data.training_examples = [
        Message("Hi Max!", data={"entities": [{"entity": "person", "value": "Max"}]}),
        Message(
            "I live in Berlin",
            data={"entities": [{"entity": "city", "value": "Berlin"}]},
        ),
    ]

    entity_extractor = RegexEntityExtractor()
    entity_extractor.train(training_data)
    entity_extractor.process(message)

    entities = message.get(ENTITIES)
    assert entities == expected_entities
def test_count_vector_featurizer_response_attribute_featurization(
    sentence, intent, response, intent_features, response_features
):
    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})
    tk = WhitespaceTokenizer()

    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    # add a second example that has some response, so that the vocabulary for
    # response exists
    second_message = Message("hello")
    second_message.set(RESPONSE, "hi")
    second_message.set(INTENT, "greet")

    data = TrainingData([train_message, second_message])

    tk.train(data)
    ftr.train(data)

    if intent_features:
        assert (
            train_message.get(SPARSE_FEATURE_NAMES[INTENT]).toarray()[0]
            == intent_features
        )
    else:
        assert train_message.get(SPARSE_FEATURE_NAMES[INTENT]) is None

    if response_features:
        assert (
            train_message.get(SPARSE_FEATURE_NAMES[RESPONSE]).toarray()[0]
            == response_features
        )
    else:
        assert train_message.get(SPARSE_FEATURE_NAMES[RESPONSE]) is None
def test_unintentional_synonyms_capitalized(component_builder):
    _config = utilities.base_test_conf("pretrained_embeddings_spacy")
    ner_syn = component_builder.create_component(_config.for_component(5), _config)

    examples = [
        Message(
            "Any Mexican restaurant will do",
            {
                "intent": "restaurant_search",
                "entities": [
                    {"start": 4, "end": 11, "value": "Mexican", "entity": "cuisine"}
                ],
            },
        ),
        Message(
            "I want Tacos!",
            {
                "intent": "restaurant_search",
                "entities": [
                    {"start": 7, "end": 12, "value": "Mexican", "entity": "cuisine"}
                ],
            },
        ),
    ]

    ner_syn.train(TrainingData(training_examples=examples), _config)

    assert ner_syn.synonyms.get("mexican") is None
    assert ner_syn.synonyms.get("tacos") == "Mexican"
def test_count_vector_featurizer_using_tokens(tokens, expected):
    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})

    # Use an empty string instead of real text to make sure the count
    # vector can only come from the `tokens` feature; relying on
    # `message.text` would not produce the correct result.
    tokens_feature = [Token(i, 0) for i in tokens]

    train_message = Message("")
    train_message.set(TOKENS_NAMES[TEXT_ATTRIBUTE], tokens_feature)

    data = TrainingData([train_message])

    ftr.train(data)

    test_message = Message("")
    test_message.set(TOKENS_NAMES[TEXT_ATTRIBUTE], tokens_feature)

    ftr.process(test_message)

    assert np.all(
        test_message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]).toarray()[0] == expected
    )
def test_count_vector_featurizer_using_tokens(tokens, expected):
    ftr = CountVectorsFeaturizer()

    # Use an empty string instead of real text to make sure the count
    # vector can only come from the `tokens` feature; relying on
    # `message.text` would not produce the correct result.
    tokens_feature = [Token(i, 0) for i in tokens]

    train_message = Message("")
    train_message.set(TOKENS_NAMES[TEXT], tokens_feature)

    data = TrainingData([train_message])

    ftr.train(data)

    test_message = Message("")
    test_message.set(TOKENS_NAMES[TEXT], tokens_feature)

    ftr.process(test_message)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])

    assert np.all(seq_vec.toarray()[0] == expected)
    assert sen_vec is not None
def test_text_featurizer(sentence, expected_features):
    featurizer = LexicalSyntacticFeaturizer(
        {
            "features": [
                ["BOS", "upper"],
                ["BOS", "EOS", "prefix2", "digit"],
                ["EOS", "low"],
            ]
        }
    )

    train_message = Message(sentence)
    test_message = Message(sentence)

    WhitespaceTokenizer().process(train_message)
    WhitespaceTokenizer().process(test_message)

    featurizer.train(TrainingData([train_message]))
    featurizer.process(test_message)

    seq_vec, sen_vec = test_message.get_sparse_features(TEXT, [])

    assert isinstance(seq_vec, scipy.sparse.coo_matrix)
    assert sen_vec is None
    assert np.all(seq_vec.toarray() == expected_features[:-1])
def test_text_featurizer(sentence, expected_features):
    featurizer = LexicalSyntacticFeaturizer(
        {
            "features": [
                ["BOS", "upper"],
                ["BOS", "EOS", "prefix2", "digit"],
                ["EOS", "low"],
            ]
        }
    )

    train_message = Message(sentence)
    test_message = Message(sentence)

    WhitespaceTokenizer().process(train_message)
    WhitespaceTokenizer().process(test_message)

    featurizer.train(TrainingData([train_message]))
    featurizer.process(test_message)

    assert isinstance(
        test_message.get(SPARSE_FEATURE_NAMES[TEXT]), scipy.sparse.coo_matrix
    )

    actual = test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()

    assert np.all(actual == expected_features)
def test_build_tag_id_dict():
    message_1 = Message("Germany is part of the European Union")
    message_1.set(
        BILOU_ENTITIES,
        ["U-location", "O", "O", "O", "O", "B-organisation", "L-organisation"],
    )

    message_2 = Message("Berlin is the capital of Germany")
    message_2.set(BILOU_ENTITIES, ["U-location", "O", "O", "O", "O", "U-location"])

    training_data = TrainingData([message_1, message_2])

    tag_id_dict = bilou_utils.build_tag_id_dict(training_data)

    assert tag_id_dict == {
        "O": 0,
        "B-location": 1,
        "I-location": 2,
        "U-location": 3,
        "L-location": 4,
        "B-organisation": 5,
        "I-organisation": 6,
        "U-organisation": 7,
        "L-organisation": 8,
    }
def test_spacy_ner_extractor(component_builder, spacy_nlp):
    _config = RasaNLUModelConfig({"pipeline": [{"name": "SpacyEntityExtractor"}]})
    ext = component_builder.create_component(_config.for_component(0), _config)
    example = Message(
        "anywhere in the U.K.",
        {
            "intent": "restaurant_search",
            "entities": [],
            # the doc must match the message text, otherwise the extractor
            # cannot find "U.K." in it
            "spacy_doc": spacy_nlp("anywhere in the U.K."),
        },
    )

    ext.process(example, spacy_nlp=spacy_nlp)

    assert len(example.get("entities", [])) == 1
    assert example.get("entities")[0] == {
        "start": 16,
        "extractor": "SpacyEntityExtractor",
        "end": 20,
        "value": "U.K.",
        "entity": "GPE",
        "confidence": None,
    }

    # Test dimension filtering includes only specified dimensions
    example = Message(
        "anywhere in the West with Sebastian Thrun",
        {
            "intent": "example_intent",
            "entities": [],
            "spacy_doc": spacy_nlp("anywhere in the West with Sebastian Thrun"),
        },
    )
    _config = RasaNLUModelConfig({"pipeline": [{"name": "SpacyEntityExtractor"}]})
    _config.set_component_attr(0, dimensions=["PERSON"])
    ext = component_builder.create_component(_config.for_component(0), _config)
    ext.process(example, spacy_nlp=spacy_nlp)

    assert len(example.get("entities", [])) == 1
    assert example.get("entities")[0] == {
        "start": 26,
        "extractor": "SpacyEntityExtractor",
        "end": 41,
        "value": "Sebastian Thrun",
        "entity": "PERSON",
        "confidence": None,
    }
def test_count_vector_featurizer_persist_load(tmpdir):
    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer

    # set non-default values in the config
    config = {
        "analyzer": "char",
        "token_pattern": r"(?u)\b\w+\b",
        "strip_accents": "ascii",
        "stop_words": "stop",
        "min_df": 2,
        "max_df": 3,
        "min_ngram": 2,
        "max_ngram": 3,
        "max_features": 10,
        "lowercase": False,
    }
    train_ftr = CountVectorsFeaturizer(config)

    sentence1 = "ababab 123 13xc лаомтгцу sfjv oö aà"
    sentence2 = "abababalidcn 123123 13xcdc лаомтгцу sfjv oö aà"
    train_message1 = Message(sentence1)
    train_message2 = Message(sentence2)

    # this is needed for a valid training example
    train_message1.set("intent", "bla")
    train_message2.set("intent", "bla")
    data = TrainingData([train_message1, train_message2])
    train_ftr.train(data)

    # persist featurizer
    file_dict = train_ftr.persist("ftr", tmpdir.strpath)
    train_vect_params = train_ftr.vectorizer.get_params()

    # add trained vocabulary to vectorizer params
    train_vect_params.update({"vocabulary": train_ftr.vectorizer.vocabulary_})

    # load featurizer
    meta = train_ftr.component_config.copy()
    meta.update(file_dict)
    test_ftr = CountVectorsFeaturizer.load(meta, tmpdir.strpath)
    test_vect_params = test_ftr.vectorizer.get_params()

    assert train_vect_params == test_vect_params

    test_message1 = Message(sentence1)
    test_ftr.process(test_message1)
    test_message2 = Message(sentence2)
    test_ftr.process(test_message2)

    # check that train features and test features after loading are the same
    assert np.all(
        [
            train_message1.get("text_features") == test_message1.get("text_features"),
            train_message2.get("text_features") == test_message2.get("text_features"),
        ]
    )
def test_apply_bilou_schema():
    tokenizer = WhitespaceTokenizer()

    message_1 = Message("Germany is part of the European Union")
    message_1.set(
        ENTITIES,
        [
            {"start": 0, "end": 7, "value": "Germany", "entity": "location"},
            {
                "start": 23,
                "end": 37,
                "value": "European Union",
                "entity": "organisation",
            },
        ],
    )
    message_2 = Message("Berlin is the capital of Germany")
    message_2.set(
        ENTITIES,
        [
            {"start": 0, "end": 6, "value": "Berlin", "entity": "location"},
            {"start": 25, "end": 32, "value": "Germany", "entity": "location"},
        ],
    )

    training_data = TrainingData([message_1, message_2])

    tokenizer.train(training_data)

    bilou_utils.apply_bilou_schema(training_data)

    assert message_1.get(BILOU_ENTITIES) == [
        "U-location",
        "O",
        "O",
        "O",
        "O",
        "B-organisation",
        "L-organisation",
        "O",
    ]
    assert message_2.get(BILOU_ENTITIES) == [
        "U-location",
        "O",
        "O",
        "O",
        "O",
        "U-location",
        "O",
    ]
def test_count_vector_featurizer_char(sentence, expected):
    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"min_ngram": 1, "max_ngram": 2, "analyzer": "char"})
    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(sentence)
    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)
def test_do_not_overwrite_any_entities():
    message = Message("Max lives in Berlin.")
    message.set(ENTITIES, [{"entity": "person", "value": "Max", "start": 0, "end": 3}])

    training_data = TrainingData()
    training_data.training_examples = [
        Message("Hi Max!", data={"entities": [{"entity": "person", "value": "Max"}]}),
        Message(
            "I live in Berlin",
            data={"entities": [{"entity": "city", "value": "Berlin"}]},
        ),
    ]
    training_data.lookup_tables = [
        {"name": "city", "elements": ["London", "Berlin", "Amsterdam"]}
    ]

    entity_extractor = RegexEntityExtractor()
    entity_extractor.train(training_data)
    entity_extractor.process(message)

    entities = message.get(ENTITIES)
    assert entities == [
        {"entity": "person", "value": "Max", "start": 0, "end": 3},
        {
            "entity": "city",
            "value": "Berlin",
            "start": 13,
            "end": 19,
            "extractor": "RegexEntityExtractor",
        },
    ]
def test_count_vector_featurizer_oov_token(sentence, expected):
    ftr = CountVectorsFeaturizer({"OOV_token": "__oov__"})
    train_message = Message(sentence)
    WhitespaceTokenizer().process(train_message)

    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(sentence)
    ftr.process(test_message)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])

    assert np.all(seq_vec.toarray()[0] == expected)
    assert sen_vec is not None
def test_count_vector_featurizer(sentence, expected):
    from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})
    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(sentence)
    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)
def test_spacy_ner_extractor(component_builder, spacy_nlp):
    _config = RasaNLUModelConfig({"pipeline": [{"name": "SpacyEntityExtractor"}]})
    ext = component_builder.create_component(_config.for_component(0), _config)
    example = Message(
        "anywhere in the West",
        {
            "intent": "restaurant_search",
            "entities": [],
            "spacy_doc": spacy_nlp("anywhere in the west"),
        },
    )

    ext.process(example, spacy_nlp=spacy_nlp)

    assert len(example.get("entities", [])) == 1
    assert example.get("entities")[0] == {
        "start": 16,
        "extractor": "SpacyEntityExtractor",
        "end": 20,
        "value": "West",
        "entity": "LOC",
        "confidence": None,
    }

    # Test dimension filtering includes only specified dimensions
    example = Message(
        "anywhere in the West with Sebastian Thrun",
        {
            "intent": "example_intent",
            "entities": [],
            "spacy_doc": spacy_nlp("anywhere in the West with Sebastian Thrun"),
        },
    )
    _config = RasaNLUModelConfig({"pipeline": [{"name": "SpacyEntityExtractor"}]})
    _config.set_component_attr(0, dimensions=["PERSON"])
    ext = component_builder.create_component(_config.for_component(0), _config)
    ext.process(example, spacy_nlp=spacy_nlp)

    assert len(example.get("entities", [])) == 1
    assert example.get("entities")[0] == {
        "start": 26,
        "extractor": "SpacyEntityExtractor",
        "end": 41,
        "value": "Sebastian Thrun",
        "entity": "PERSON",
        "confidence": None,
    }
def parse(
    self,
    text: Text,
    time: Optional[datetime.datetime] = None,
    only_output_properties: bool = True,
) -> Dict[Text, Any]:
    """Parse the input text, classify it and return pipeline result.

    The pipeline result usually contains intent and entities."""

    if not text:
        # Not all components are able to handle empty strings. So we need
        # to prevent that... This default return will not contain all
        # output attributes of all components, but in the end, no one
        # should pass an empty string in the first place.
        output = self.default_output_attributes()
        output["text"] = ""
        return output

    message = Message(text, self.default_output_attributes(), time=time)

    for component in self.pipeline:
        component.process(message, **self.context)

    output = self.default_output_attributes()
    output.update(message.as_dict(only_output_properties=only_output_properties))
    return output
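# Usage sketch for `parse` (illustrative only: `interpreter` is a hypothetical
# trained instance exposing this method; the exact output keys depend on the
# configured pipeline):
#
#     result = interpreter.parse("I want Tacos!")
#     result["intent"]    # e.g. {"name": "restaurant_search", "confidence": ...}
#     result["entities"]  # e.g. [{"value": "Mexican", "entity": "cuisine", ...}]
#
#     # An empty string short-circuits before any component runs:
#     interpreter.parse("")  # default output attributes with text == ""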
def test_count_vectors_featurizer_train():
    featurizer = CountVectorsFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today ?"
    message = Message(sentence)
    message.set(RESPONSE, sentence)
    message.set(INTENT, "intent")
    WhitespaceTokenizer().train(TrainingData([message]))

    featurizer.train(TrainingData([message]), RasaNLUModelConfig())

    expected = np.array([0, 1, 0, 0, 0])
    expected_cls = np.array([1, 1, 1, 1, 1])

    seq_vec, sen_vec = message.get_sparse_features(TEXT, [])

    assert (5, 5) == seq_vec.shape
    assert (1, 5) == sen_vec.shape
    assert np.all(seq_vec.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vec, sen_vec = message.get_sparse_features(RESPONSE, [])

    assert (5, 5) == seq_vec.shape
    assert (1, 5) == sen_vec.shape
    assert np.all(seq_vec.toarray()[0] == expected)
    assert np.all(sen_vec.toarray()[-1] == expected_cls)

    seq_vec, sen_vec = message.get_sparse_features(INTENT, [])

    assert sen_vec is None
    assert (1, 1) == seq_vec.shape
    assert np.all(seq_vec.toarray()[0] == np.array([1]))
def test_count_vector_featurizer_attribute_featurization(
    sentence, intent, response, intent_features, response_features
):
    ftr = CountVectorsFeaturizer()
    tk = WhitespaceTokenizer()

    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set(INTENT, intent)
    train_message.set(RESPONSE, response)

    data = TrainingData([train_message])

    tk.train(data)
    ftr.train(data)

    intent_seq_vecs, intent_sen_vecs = train_message.get_sparse_features(INTENT, [])
    response_seq_vecs, response_sen_vecs = train_message.get_sparse_features(
        RESPONSE, []
    )

    if intent_features:
        assert intent_seq_vecs.toarray()[0] == intent_features
        assert intent_sen_vecs is None
    else:
        assert intent_seq_vecs is None
        assert intent_sen_vecs is None

    if response_features:
        assert response_seq_vecs.toarray()[0] == response_features
        assert response_sen_vecs is not None
    else:
        assert response_seq_vecs is None
        assert response_sen_vecs is None
def filter_trainable_entities(
    self, entity_examples: List[Message]
) -> List[Message]:
    """Filters out untrainable entity annotations.

    Creates a copy of entity_examples in which entities that have
    `extractor` set to something other than self.name
    (e.g. 'CRFEntityExtractor') are removed.
    """

    filtered = []
    for message in entity_examples:
        entities = []
        for ent in message.get(ENTITIES, []):
            extractor = ent.get(EXTRACTOR)
            if not extractor or extractor == self.name:
                entities.append(ent)
        data = message.data.copy()
        data[ENTITIES] = entities
        filtered.append(
            Message(
                text=message.text,
                data=data,
                output_properties=message.output_properties,
                time=message.time,
            )
        )

    return filtered
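# Usage sketch for `filter_trainable_entities` (illustrative only: `extractor`
# is a hypothetical component instance whose `name` is "CRFEntityExtractor"):
#
#     msg = Message(
#         "I live in Berlin",
#         data={
#             ENTITIES: [
#                 # kept: no extractor recorded, so the annotation is trainable
#                 {"entity": "city", "value": "Berlin", "start": 10, "end": 16},
#                 # dropped: annotated by a different extractor
#                 {"entity": "city", "value": "Berlin",
#                  "extractor": "DucklingHTTPExtractor"},
#             ]
#         },
#     )
#     filtered = extractor.filter_trainable_entities([msg])
#     assert len(filtered[0].get(ENTITIES)) == 1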
def test_convert_featurizer_train():
    featurizer = ConveRTFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today ?"
    message = Message(sentence)
    message.set(RESPONSE, sentence)
    tokens = ConveRTTokenizer().tokenize(message, attribute=TEXT)
    tokens = Tokenizer.add_cls_token(tokens, attribute=TEXT)
    message.set(TOKENS_NAMES[TEXT], tokens)
    message.set(TOKENS_NAMES[RESPONSE], tokens)

    featurizer.train(TrainingData([message]), RasaNLUModelConfig())

    expected = np.array([2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]
    )

    vecs = message.get(DENSE_FEATURE_NAMES[TEXT])

    assert len(tokens) == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

    vecs = message.get(DENSE_FEATURE_NAMES[RESPONSE])

    assert len(tokens) == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

    vecs = message.get(DENSE_FEATURE_NAMES[INTENT])

    assert vecs is None
def test_convert_featurizer_process():
    featurizer = ConveRTFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today ?"
    message = Message(sentence)
    show_message(message)
    tokens = ConveRTTokenizer().tokenize(message, attribute=TEXT)
    tokens = Tokenizer.add_cls_token(tokens, attribute=TEXT)
    message.set(TOKENS_NAMES[TEXT], tokens)
    assert show_message(message, False) == {
        "tokens": ["hey", "how", "are", "you", "today", "__CLS__"],
        "text": "Hey how are you today ?",
    }

    featurizer.process(message)
    show_message(message)

    expected = np.array([2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]
    )

    vecs = message.get(DENSE_FEATURE_NAMES[TEXT])

    assert len(tokens) == len(vecs)
    assert len(vecs) == 6
    assert len(tokens) == 6
    assert len(vecs[0]) == 1024
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)
def test_regex_featurizer_no_sequence(sentence, expected, expected_cls, spacy_nlp):
    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    ftr = RegexFeaturizer({}, known_patterns=patterns)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(sentence)
    message.set(SPACY_DOCS[TEXT], spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr._features_for_patterns(message, TEXT)

    assert np.allclose(result.toarray()[0], expected, atol=1e-10)
    assert np.allclose(result.toarray()[-1], expected_cls, atol=1e-10)
def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer

    lookups = [
        {
            "name": "drinks",
            "elements": ["mojito", "lemonade", "sweet berry wine", "tea", "club?mate"],
        },
        {"name": "plates", "elements": "data/test/lookup_tables/plates.txt"},
    ]
    ftr = RegexFeaturizer()
    ftr.add_lookup_tables(lookups)

    # adds tokens to the message
    component_config = {"name": "SpacyTokenizer"}
    tokenizer = SpacyTokenizer(component_config)
    message = Message(sentence)
    message.set("text_spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr._features_for_patterns(message, TEXT)

    assert np.allclose(result.toarray(), expected, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
def test_compute_default_label_features():
    label_features = [
        Message("test a"),
        Message("test b"),
        Message("test c"),
        Message("test d"),
    ]

    output = DIETClassifier._compute_default_label_features(label_features)
    output = output[0]

    for i, o in enumerate(output):
        assert isinstance(o, np.ndarray)
        assert o[0][i] == 1
        assert o.shape == (1, len(label_features))
def test_spacy_featurizer_train(spacy_nlp):
    featurizer = SpacyFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today"
    message = Message(sentence)
    message.set(RESPONSE_ATTRIBUTE, sentence)
    message.set(INTENT_ATTRIBUTE, "intent")
    message.set(SPACY_DOCS[TEXT_ATTRIBUTE], spacy_nlp(sentence))
    message.set(SPACY_DOCS[RESPONSE_ATTRIBUTE], spacy_nlp(sentence))

    featurizer.train(TrainingData([message]), RasaNLUModelConfig())

    expected = np.array([-0.28451, 0.31007, -0.57039, -0.073056, -0.17322])
    expected_cls = np.array([-0.196496, 0.3249364, -0.37408298, -0.10622784, 0.062756])

    vecs = message.get(DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE])

    assert 6 == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

    vecs = message.get(DENSE_FEATURE_NAMES[RESPONSE_ATTRIBUTE])

    assert 6 == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

    vecs = message.get(DENSE_FEATURE_NAMES[INTENT_ATTRIBUTE])

    assert vecs is None
def test_train_tokenizer(text, expected_tokens, expected_indices):
    tk = WhitespaceTokenizer()

    message = Message(text)
    message.set(RESPONSE_ATTRIBUTE, text)
    message.set(INTENT_ATTRIBUTE, text)

    training_data = TrainingData()
    training_data.training_examples = [message]

    tk.train(training_data)

    for attribute in [RESPONSE_ATTRIBUTE, TEXT_ATTRIBUTE]:
        tokens = training_data.training_examples[0].get(TOKENS_NAMES[attribute])

        assert [t.text for t in tokens] == expected_tokens
        assert [t.start for t in tokens] == [i[0] for i in expected_indices]
        assert [t.end for t in tokens] == [i[1] for i in expected_indices]

    # check intent attribute
    tokens = training_data.training_examples[0].get(TOKENS_NAMES[INTENT_ATTRIBUTE])

    assert [t.text for t in tokens] == [text]
def test_convert_featurizer_tokens_to_text(component_builder, sentence, expected_text):
    tokenizer = component_builder.create_component_from_class(ConveRTTokenizer)
    tokens = tokenizer.tokenize(Message(sentence), attribute=TEXT)

    actual_text = ConveRTFeaturizer._tokens_to_text([tokens])[0]

    assert expected_text == actual_text