def _combine_with_existing_sparse_features(
    message: Message,
    additional_features: Any,
    feature_name: Text = SPARSE_FEATURE_NAMES[TEXT],
) -> Any:
    if additional_features is None:
        return

    if message.get(feature_name) is not None:
        from scipy.sparse import hstack

        if message.get(feature_name).shape[0] != additional_features.shape[0]:
            raise ValueError(
                f"Cannot concatenate sparse features as sequence dimension does not "
                f"match: {message.get(feature_name).shape[0]} != "
                f"{additional_features.shape[0]}. Message: '{message.text}'."
            )

        return hstack([message.get(feature_name), additional_features])
    else:
        return additional_features
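# Illustrative sketch (not part of the component above): how the sparse
# combination behaves. scipy.sparse.hstack joins two matrices along the
# feature axis, provided the row (sequence) dimension matches. The data
# below is made up for demonstration only.
import numpy as np
import scipy.sparse

existing = scipy.sparse.csr_matrix(np.eye(3))  # 3 tokens x 3 features
additional = scipy.sparse.csr_matrix(np.ones((3, 2)))  # 3 tokens x 2 features
combined = scipy.sparse.hstack([existing, additional])
assert combined.shape == (3, 5)  # feature dims add up, row count unchanged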
def set_fasttext_features(self, message: Message, attribute: Text = TEXT) -> None:
    tokens = message.get(TOKENS_NAMES[attribute])

    if not tokens:
        return None

    text_vector = self.model.get_word_vector(message.text)
    word_vectors = [
        self.model.get_word_vector(t.text)
        for t in train_utils.tokens_without_cls(message, attribute)
    ]
    X = np.array(word_vectors + [text_vector])  # remember, we need one for __CLS__

    features = self._combine_with_existing_dense_features(
        message, additional_features=X, feature_name=DENSE_FEATURE_NAMES[attribute]
    )
    message.set(DENSE_FEATURE_NAMES[attribute], features)
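# Illustrative sketch: the featurizer above stacks one word vector per token
# plus a whole-text vector for the trailing __CLS__ token, giving a
# (n_tokens + 1, dim) matrix. Random vectors stand in for the real
# model.get_word_vector output; dimensions are made up.
import numpy as np

dim, n_tokens = 5, 3
word_vectors = [np.random.rand(dim) for _ in range(n_tokens)]
text_vector = np.random.rand(dim)  # stands in for the whole-message vector
X = np.array(word_vectors + [text_vector])
assert X.shape == (n_tokens + 1, dim)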
def _combine_with_existing_dense_features(
    message: Message,
    additional_features: Any,
    feature_name: Text = DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE],
) -> Any:
    if message.get(feature_name) is not None:
        if len(message.get(feature_name)) != len(additional_features):
            raise ValueError(
                f"Cannot concatenate dense features as sequence dimension does not "
                f"match: {len(message.get(feature_name))} != "
                f"{len(additional_features)}. "
                f"Make sure to set 'return_sequence' to the same value for all your "
                f"featurizers."
            )

        return np.concatenate(
            (message.get(feature_name), additional_features), axis=-1
        )
    else:
        return additional_features
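# Illustrative sketch of the dense case: two (seq_len, dim) matrices with
# matching sequence length are concatenated along the last (feature) axis.
# Example data only, not the Rasa API.
import numpy as np

existing = np.ones((4, 3))  # e.g. 4 tokens x 3 spaCy dims
additional = np.zeros((4, 2))  # e.g. 4 tokens x 2 extra dims
combined = np.concatenate((existing, additional), axis=-1)
assert combined.shape == (4, 5)  # sequence length kept, feature dims summed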
def test_jieba_load_dictionary(tmpdir_factory):
    dictionary_path = tmpdir_factory.mktemp("jieba_custom_dictionary").strpath

    component_config = {"dictionary_path": dictionary_path}

    with patch.object(
        JiebaTokenizer, "load_custom_dictionary", return_value=None
    ) as mock_method:
        tk = JiebaTokenizer(component_config)
        tk.tokenize(Message(""), attribute=TEXT_ATTRIBUTE)

    mock_method.assert_called_once_with(dictionary_path)
def test_count_vector_featurizer():
    sentence, expected, expected_cls = samples[1]
    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})

    train_message = Message(sentence)
    test_message = Message(sentence)

    WhitespaceTokenizer().process(train_message)
    WhitespaceTokenizer().process(test_message)

    show_message(train_message)
    ftr.train(TrainingData([train_message]))
    show_message(train_message)

    ftr.process(test_message)

    assert isinstance(
        test_message.get(SPARSE_FEATURE_NAMES[TEXT]), scipy.sparse.coo_matrix
    )

    actual = test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()

    assert np.all(actual[0] == expected)
    assert np.all(actual[-1] == expected_cls)
def process(self, message: Message, **kwargs: Any):
    spans = message.get("spans", [])
    pronouns = [span for span in spans if span["label"] == "Pronoun"]

    coreferences = []
    for pronoun in pronouns:
        ent = self.stag(pronoun, message)
        if ent:
            coreferences.append(
                {
                    "pronoun": {"start": pronoun["start"], "end": pronoun["end"]},
                    "entity": {"start": ent["start"], "end": ent["end"]},
                }
            )

    span_output_format(spans)
    message.set("coreferences", coreferences, add_to_output=True)
    logging.info("coref data: {}".format(message.data))
def test_text_featurizer_window_size(sentence, expected, expected_cls):
    featurizer = LexicalSyntacticFeaturizer(
        {"features": [["upper"], ["digit"], ["low"], ["digit"]]}
    )

    train_message = Message(sentence)
    test_message = Message(sentence)

    WhitespaceTokenizer().process(train_message)
    WhitespaceTokenizer().process(test_message)

    featurizer.train(TrainingData([train_message]))
    featurizer.process(test_message)

    assert isinstance(
        test_message.get(SPARSE_FEATURE_NAMES[TEXT]), scipy.sparse.coo_matrix
    )

    actual = test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()

    assert np.all(actual[0] == expected)
    assert np.all(actual[-1] == expected_cls)
def process(self, message: Message, **kwargs):
    """Process an incoming message.

    Determine the gender of person entities:
    1. Extract the person entities.
    2. Predict the gender of each entity.
    """
    entities = message.get("entities", [])
    for ent in entities:
        if ent["dim"] == "Nh":
            # If this is a person entity, predict its gender.
            _g = self.n2g.predict(ent["value"])[0]
            ent.update({"gender": self.int2label[_g]})
def process(self, message: Message, **kwargs: Any) -> None:
    """Process incoming message and compute and set features."""
    if self.vectorizers is None:
        logger.error(
            "There is no trained CountVectorizer: "
            "component is either not trained or "
            "didn't receive enough training data"
        )
        return

    attribute = TEXT
    message_tokens = self._get_processed_message_tokens_by_attribute(message, attribute)

    # features shape (1, seq, dim)
    features = self._create_sequence(attribute, [message_tokens])

    if features[0] is not None:
        final_features = Features(
            features[0], attribute, self.component_config[FEATURIZER_CLASS_ALIAS]
        )
        message.add_features(final_features)
def test_count_vector_featurizer(sentence, expected, expected_cls):
    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})

    train_message = Message(sentence)
    test_message = Message(sentence)

    WhitespaceTokenizer().process(train_message)
    WhitespaceTokenizer().process(test_message)

    ftr.train(TrainingData([train_message]))
    ftr.process(test_message)

    vecs = test_message.get_sparse_features(TEXT, [])

    assert isinstance(vecs, scipy.sparse.coo_matrix)

    actual_vecs = vecs.toarray()

    assert np.all(actual_vecs[0] == expected)
    assert np.all(actual_vecs[-1] == expected_cls)
def tags_to_ids(message: Message, tag_id_dict: Dict[Text, int]) -> List[int]:
    """Maps the entity tags of the message to the ids of the provided dict.

    Args:
        message: the message
        tag_id_dict: mapping of tags to ids

    Returns:
        a list of tag ids
    """
    if message.get(BILOU_ENTITIES):
        _tags = [
            tag_id_dict[_tag] if _tag in tag_id_dict else tag_id_dict[NO_ENTITY_TAG]
            for _tag in message.get(BILOU_ENTITIES)
        ]
    else:
        _tags = [tag_id_dict[NO_ENTITY_TAG] for _ in message.get(TOKENS_NAMES[TEXT])]

    return _tags
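# Illustrative sketch of the mapping above: tags missing from tag_id_dict
# fall back to the id of NO_ENTITY_TAG ("O"). Plain dicts and lists stand in
# for the Rasa message; the data is made up.
tag_id_dict = {"O": 0, "B-city": 1, "L-city": 2}
bilou_tags = ["O", "B-city", "L-city", "B-airport"]  # last tag is unknown
ids = [tag_id_dict.get(tag, tag_id_dict["O"]) for tag in bilou_tags]
assert ids == [0, 1, 2, 0]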
def test_convert_featurizer_process():
    featurizer = ConveRTFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today ?"
    message = Message(sentence)
    tokens = ConveRTTokenizer().tokenize(message, attribute=TEXT)
    tokens = Tokenizer.add_cls_token(tokens, attribute=TEXT)
    message.set(TOKENS_NAMES[TEXT], tokens)

    featurizer.process(message)

    expected = np.array([2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]
    )

    vecs = message.get(DENSE_FEATURE_NAMES[TEXT])

    assert len(tokens) == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)
def process(self, message: Message, **kwargs: Any) -> None:
    mitie_feature_extractor = kwargs.get("mitie_feature_extractor")
    if not mitie_feature_extractor:
        raise Exception(
            "Failed to train 'MitieFeaturizer'. "
            "Missing a proper MITIE feature extractor."
        )

    if self.clf:
        token_strs = self._tokens_of_message(message)
        intent, confidence = self.clf(token_strs, mitie_feature_extractor)
    else:
        # either the model didn't get trained or it wasn't
        # provided with any data
        intent = None
        confidence = 0.0

    message.set(
        "intent", {"name": intent, "confidence": confidence}, add_to_output=True
    )
def test_spacy_featurizer_sequence(sentence, expected, spacy_nlp):
    from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer

    doc = spacy_nlp(sentence)
    token_vectors = [t.vector for t in doc]

    ftr = SpacyFeaturizer.create({}, RasaNLUModelConfig())

    greet = {"intent": "greet", "text_features": [0.5]}
    message = Message(sentence, greet)
    message.set(SPACY_DOCS[TEXT], doc)

    ftr._set_spacy_features(message)

    seq_vecs, sen_vecs = message.get_dense_features(TEXT, [])

    vecs = seq_vecs[0][:5]

    assert np.allclose(token_vectors[0][:5], vecs, atol=1e-4)
    assert np.allclose(vecs, expected, atol=1e-4)
    assert sen_vecs is not None
def _from_crf_to_json(
    self, message: Message, entities: List[Any]
) -> List[Dict[Text, Any]]:
    if self.pos_features:
        tokens = message.get("spacy_doc")
    else:
        tokens = message.get("tokens")

    if len(tokens) != len(entities):
        raise Exception(
            "Inconsistency in amount of tokens between crfsuite and message"
        )

    if self.component_config["BILOU_flag"]:
        return self._convert_bilou_tagging_to_entity_result(message, tokens, entities)
    else:
        # not using BILOU tagging scheme, multi-word entities are split.
        return self._convert_simple_tagging_to_entity_result(tokens, entities)
def _read_intent(self, intent_js, examples_js):
    """Reads the intent and examples from respective jsons."""
    from rasa.nlu.training_data import Message, TrainingData

    intent = intent_js.get("name")

    training_examples = []
    for ex in examples_js:
        text, entities = self._join_text_chunks(ex["data"])
        training_examples.append(Message.build(text, intent, entities))

    return TrainingData(training_examples)
def test_mitie_featurizer(mitie_feature_extractor):
    featurizer = MitieFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today"
    message = Message(sentence)
    MitieTokenizer().process(message)
    tokens = message.get(TOKENS_NAMES[TEXT_ATTRIBUTE])

    vecs = featurizer.features_for_tokens(tokens, mitie_feature_extractor)

    expected = np.array(
        [0.00000000e00, -5.12735510e00, 4.39929873e-01, -5.60760403e00, -8.26445103e00]
    )
    expected_cls = np.array([0.0, -4.4551446, 0.26073121, -1.46632245, -1.84205751])

    assert 6 == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)
def map_message_entities(message: Message) -> List[Tuple[int, int, Text]]:
    """Maps the entities of the given message to their start, end, and tag values.

    Args:
        message: the message

    Returns:
        a list of start, end, and tag value tuples
    """

    def convert_entity(entity: Dict[Text, Any]) -> Tuple[int, int, Text]:
        return entity["start"], entity["end"], entity["entity"]

    return [convert_entity(entity) for entity in message.get(ENTITIES, [])]
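# Illustrative sketch: entity dicts as they typically appear under
# message.get(ENTITIES) and the (start, end, tag) tuples the helper above
# yields for them. Example data only.
entities = [
    {"start": 0, "end": 4, "value": "Rasa", "entity": "company"},
    {"start": 21, "end": 27, "value": "Berlin", "entity": "city"},
]
offsets = [(e["start"], e["end"], e["entity"]) for e in entities]
assert offsets == [(0, 4, "company"), (21, 27, "city")]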
def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer

    lookups = [
        {
            "name": "drinks",
            "elements": ["mojito", "lemonade", "sweet berry wine", "tea", "club?mate"],
        },
        {"name": "plates", "elements": "data/test/lookup_tables/plates.txt"},
    ]
    ftr = RegexFeaturizer()
    ftr.add_lookup_tables(lookups)

    # adds tokens to the message
    component_config = {"name": "SpacyTokenizer"}
    tokenizer = SpacyTokenizer(component_config)
    message = Message(sentence)
    message.set("text_spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(result.toarray(), expected, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
def test_train_tokenizer(text, expected_tokens, expected_indices):
    tk = WhitespaceTokenizer()

    message = Message(text)
    message.set(RESPONSE_ATTRIBUTE, text)
    message.set(INTENT_ATTRIBUTE, text)

    training_data = TrainingData()
    training_data.training_examples = [message]

    tk.train(training_data)

    for attribute in [RESPONSE_ATTRIBUTE, TEXT_ATTRIBUTE]:
        tokens = training_data.training_examples[0].get(TOKENS_NAMES[attribute])

        assert [t.text for t in tokens] == expected_tokens
        assert [t.start for t in tokens] == [i[0] for i in expected_indices]
        assert [t.end for t in tokens] == [i[1] for i in expected_indices]

    # check intent attribute
    tokens = training_data.training_examples[0].get(TOKENS_NAMES[INTENT_ATTRIBUTE])

    assert [t.text for t in tokens] == [text]
def test_crf_use_dense_features(spacy_nlp):
    crf_extractor = CRFEntityExtractor(
        component_config={
            "features": [
                ["low", "title", "upper", "pos", "pos2"],
                [
                    "low",
                    "suffix3",
                    "suffix2",
                    "upper",
                    "title",
                    "digit",
                    "pos",
                    "pos2",
                    "text_dense_features",
                ],
                ["low", "title", "upper", "pos", "pos2"],
            ]
        }
    )

    spacy_featurizer = SpacyFeaturizer()
    spacy_tokenizer = SpacyTokenizer()

    text = "Rasa is a company in Berlin"
    message = Message(text)
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))

    spacy_tokenizer.process(message)
    spacy_featurizer.process(message)

    text_data = crf_extractor._from_text_to_crf(message)
    features = crf_extractor._sentence_to_features(text_data)

    assert "0:text_dense_features" in features[0]
    for i in range(len(message.data.get("text_dense_features")[0])):
        assert (
            features[0]["0:text_dense_features"]["text_dense_features"][str(i)]
            == message.data.get("text_dense_features")[0][i]
        )
def _from_json_to_crf(
    self, message: Message, entity_offsets: List[Tuple[int, int, Text]]
) -> List[Tuple[Optional[Text], Optional[Text], Text, Dict[Text, Any]]]:
    """Convert json examples to format of underlying crfsuite."""
    if self.pos_features:
        from spacy.gold import GoldParse  # pytype: disable=import-error

        doc_or_tokens = message.get("spacy_doc")
        gold = GoldParse(doc_or_tokens, entities=entity_offsets)
        ents = [l[5] for l in gold.orig_annot]
    else:
        doc_or_tokens = message.get("tokens")
        ents = self._bilou_tags_from_offsets(doc_or_tokens, entity_offsets)

    # collect badly annotated examples
    collected = []
    for t, e in zip(doc_or_tokens, ents):
        if e == "-":
            collected.append(t)
        elif collected:
            collected_text = " ".join([t.text for t in collected])
            logger.warning(
                "Misaligned entity annotation for '{}' "
                "in sentence '{}' with intent '{}'. "
                "Make sure the start and end values of the "
                "annotated training examples end at token "
                "boundaries (e.g. don't include trailing "
                "whitespaces or punctuation)."
                "".format(collected_text, message.text, message.get("intent"))
            )
            collected = []

    if not self.component_config["BILOU_flag"]:
        for i, label in enumerate(ents):
            if self._bilou_from_label(label) in {"B", "I", "U", "L"}:
                # removes BILOU prefix from label
                ents[i] = self._entity_from_label(label)

    return self._from_text_to_crf(message, ents)
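# Illustrative sketch of the BILOU-stripping step at the end of the method:
# with BILOU_flag off, "B-city" collapses to "city" while "O" and "-" pass
# through unchanged. The one-liner below only mimics what
# _bilou_from_label/_entity_from_label do, on made-up data; it is not the
# actual Rasa implementation.
ents = ["O", "B-city", "I-city", "L-city", "U-airport"]
stripped = [
    e.split("-", 1)[1] if e[:2] in {"B-", "I-", "U-", "L-"} else e for e in ents
]
assert stripped == ["O", "city", "city", "city", "airport"]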
async def test_get_nlu_data(
    Faker: asynctest.MagicMock, load_data: asynctest.MagicMock
) -> None:
    faker_ = Faker()
    faker_.name.return_value = "Nikola Tesla"

    training_data = TrainingData(
        training_examples=[
            Message.build("hello", "intent_test"),
            Message.build("hello @name", "intent_test"),
            Message.build("hello"),
        ]
    )
    load_data.return_value = training_data

    importer = PlaceholderImporter()
    importer.config = {"importers": [{"name": "rasam.PlaceholderImporter"}]}
    importer._nlu_files = ["test"]

    new_training_data = await importer.get_nlu_data()

    faker_.seed_instance.assert_called_once_with(importer.DEFAULT_FAKE_DATA_COUNT)
    load_data.assert_called_once_with("test", "en")

    message: Message
    expected_messages = [
        Message.build("hello", "intent_test"),
        Message.build("hello Nikola Tesla", "intent_test"),
        Message.build("hello"),
    ]
    for message, expected in zip(new_training_data.training_examples, expected_messages):
        assert message.get("intent") == expected.get("intent")
        assert message.get("text") == expected.get("text")
def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa.nlu.featurizers.regex_featurizer import RegexFeaturizer

    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    ftr = RegexFeaturizer(known_patterns=patterns)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(sentence)
    message.set("spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr.features_for_patterns(message)
    assert np.allclose(result, expected, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get("tokens", [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get("tokens")):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa.nlu.featurizers.regex_featurizer import RegexFeaturizer

    lookups = [
        {
            "name": "drinks",
            "elements": ["mojito", "lemonade", "sweet berry wine", "tea", "club?mate"],
        },
        {"name": "plates", "elements": "data/test/lookup_tables/plates.txt"},
    ]
    ftr = RegexFeaturizer(lookup_tables=lookups)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(sentence)
    message.set("spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr.features_for_patterns(message)
    assert np.allclose(result, expected, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get("tokens", [])) > 0

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get("tokens")):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
def tokenize(self, message: Message, attribute: Text) -> List[Token]:
    import mitie

    text = message.get(attribute)

    encoded_sentence = text.encode(DEFAULT_ENCODING)
    tokenized = mitie.tokenize_with_offsets(encoded_sentence)
    tokens = [
        self._token_from_offset(token, offset, encoded_sentence)
        for token, offset in tokenized
    ]

    return self._apply_token_pattern(tokens)
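# Illustrative sketch: MITIE reports byte offsets into the *encoded*
# sentence, so a token must be sliced from the encoded bytes and decoded
# back to text (assuming DEFAULT_ENCODING is "utf-8"). Hand-computed
# example data; no MITIE install required here.
encoded = "Héllo wörld".encode("utf-8")
token_bytes, offset = b"w\xc3\xb6rld", 7  # as tokenize_with_offsets would yield
text = encoded[offset:offset + len(token_bytes)].decode("utf-8")
assert text == "wörld"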
def process(self, message: Message, **kwargs: Any) -> None:
    """Process incoming message and compute and set features."""
    if self.vectorizers is None:
        logger.error(
            "There is no trained CountVectorizer: "
            "component is either not trained or "
            "didn't receive enough training data"
        )
    else:
        message_text = self._get_message_text_by_attribute(
            message, attribute=MESSAGE_TEXT_ATTRIBUTE
        )

        bag = (
            self.vectorizers[MESSAGE_TEXT_ATTRIBUTE]
            .transform([message_text])
            .toarray()
            .squeeze()
        )

        message.set(
            MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE],
            self._combine_with_existing_features(
                message,
                bag,
                feature_name=MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE],
            ),
        )
def test_convert_featurizer_process(component_builder):
    tokenizer = component_builder.create_component_from_class(ConveRTTokenizer)
    featurizer = component_builder.create_component_from_class(ConveRTFeaturizer)

    sentence = "Hey how are you today ?"
    message = Message(sentence)
    tokens = tokenizer.tokenize(message, attribute=TEXT)
    tokens = tokenizer.add_cls_token(tokens, attribute=TEXT)
    message.set(TOKENS_NAMES[TEXT], tokens)

    featurizer.process(message, tf_hub_module=tokenizer.module)

    expected = np.array([2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]
    )

    vecs = message.get(DENSE_FEATURE_NAMES[TEXT])

    assert len(tokens) == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)
def _get_tags(self, message: Message) -> Dict[Text, List[Text]]:
    """Get assigned entity tags of message."""
    tokens = train_utils.tokens_without_cls(message)
    tags = {}

    for tag_name in self.crf_order:
        if self.component_config[BILOU_FLAG]:
            bilou_key = bilou_utils.get_bilou_key_for_tag(tag_name)

            if message.get(bilou_key):
                _tags = message.get(bilou_key)
            else:
                _tags = [NO_ENTITY_TAG for _ in tokens]
        else:
            _tags = [
                determine_token_labels(
                    token, message.get(ENTITIES), attribute_key=tag_name
                )
                for token in tokens
            ]

        tags[tag_name] = _tags

    return tags
def test_spacy_ner_extractor(component_builder, spacy_nlp):
    _config = RasaNLUModelConfig({"pipeline": [{"name": "SpacyEntityExtractor"}]})
    ext = component_builder.create_component(_config.for_component(0), _config)
    example = Message(
        "anywhere in the West",
        {
            "intent": "restaurant_search",
            "entities": [],
            "spacy_doc": spacy_nlp("anywhere in the west"),
        },
    )

    ext.process(example, spacy_nlp=spacy_nlp)

    assert len(example.get("entities", [])) == 1
    assert example.get("entities")[0] == {
        "start": 16,
        "extractor": "SpacyEntityExtractor",
        "end": 20,
        "value": "West",
        "entity": "LOC",
        "confidence": None,
    }

    # Test dimension filtering includes only specified dimensions
    example = Message(
        "anywhere in the West with Sebastian Thrun",
        {
            "intent": "example_intent",
            "entities": [],
            "spacy_doc": spacy_nlp("anywhere in the West with Sebastian Thrun"),
        },
    )
    _config = RasaNLUModelConfig({"pipeline": [{"name": "SpacyEntityExtractor"}]})
    _config.set_component_attr(0, dimensions=["PERSON"])
    ext = component_builder.create_component(_config.for_component(0), _config)

    ext.process(example, spacy_nlp=spacy_nlp)

    assert len(example.get("entities", [])) == 1
    assert example.get("entities")[0] == {
        "start": 26,
        "extractor": "SpacyEntityExtractor",
        "end": 41,
        "value": "Sebastian Thrun",
        "entity": "PERSON",
        "confidence": None,
    }