def tokenize(self, message: Message, attribute: Text) -> List[Token]:
    """Tokenize the given message attribute with janome and return Rasa tokens."""
    # Imported lazily so the dependency is only required when this tokenizer runs.
    from janome.tokenizer import Tokenizer

    raw_text = message.get(attribute)
    cleaned_text = self.removePunctuation(raw_text)

    janome_tokenizer = Tokenizer()
    # NOTE(review): offset is janome's node.pos minus one — looks like an
    # off-by-one adjustment; confirm against the janome API before changing.
    tokens = [
        Token(item.node.surface, item.node.pos - 1)
        for item in janome_tokenizer.tokenize(cleaned_text)
    ]
    return self._apply_token_pattern(tokens)
def test_convert_featurizer_train():
    """Training should add dense ConveRT features for TEXT and RESPONSE, not INTENT."""
    featurizer = ConveRTFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today ?"
    message = Message(sentence)
    message.set(RESPONSE, sentence)

    tokens = ConveRTTokenizer().tokenize(message, attribute=TEXT)
    tokens = Tokenizer.add_cls_token(tokens, attribute=TEXT)
    message.set(TOKENS_NAMES[TEXT], tokens)
    message.set(TOKENS_NAMES[RESPONSE], tokens)

    featurizer.train(TrainingData([message]), RasaNLUModelConfig())

    expected = np.array([2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array([1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353])

    # Both TEXT and RESPONSE were tokenized identically, so both get the same checks.
    for attribute in (TEXT, RESPONSE):
        vecs = message.get(DENSE_FEATURE_NAMES[attribute])
        assert len(tokens) == len(vecs)
        assert np.allclose(vecs[0][:5], expected, atol=1e-5)
        assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

    # INTENT was never featurized, so no dense features may exist for it.
    assert message.get(DENSE_FEATURE_NAMES[INTENT]) is None
def test_convert_featurizer_process():
    """Processing should yield one 1024-dim ConveRT vector per token (incl. __CLS__)."""
    featurizer = ConveRTFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today ?"
    message = Message(sentence)
    show_message(message)

    tokens = ConveRTTokenizer().tokenize(message, attribute=TEXT)
    tokens = Tokenizer.add_cls_token(tokens, attribute=TEXT)
    message.set(TOKENS_NAMES[TEXT], tokens)

    assert show_message(message, False) == {
        "tokens": ["hey", "how", "are", "you", "today", "__CLS__"],
        "text": "Hey how are you today ?",
    }

    featurizer.process(message)
    show_message(message)

    expected = np.array([2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array([1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353])

    vecs = message.get(DENSE_FEATURE_NAMES[TEXT])
    # 5 words + the appended __CLS__ token = 6 rows of features.
    assert len(tokens) == len(vecs) == 6
    assert len(vecs[0]) == 1024
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)
def test_component_adds_features(tokenizer, featurizer, msg):
    """If there are no features we need to add them.

    Args:
        tokenizer: tokenizer fixture used to produce tokens for ``msg``.
        featurizer: featurizer fixture under test.
        msg: raw message text to featurize.
    """
    message = Message(msg)
    tokens = tokenizer.tokenize(message, attribute=TEXT)
    tokens = Tokenizer.add_cls_token(tokens, attribute=TEXT)
    message.set(TOKENS_NAMES[TEXT], tokens)

    featurizer.process(message)

    vectors = message.get(DENSE_FEATURE_NAMES[TEXT])
    # One dense feature row per token (including the appended __CLS__ token).
    # (Removed a stray debug print of vectors.shape that polluted test output.)
    assert vectors.shape[0] == len(tokens)
def _apply_tokenizer_to_states(tokenizer: Tokenizer, states: List[State]) -> None:
    """Split each user text into tokens and concatenate them again.

    Args:
        tokenizer: A tokenizer to tokenize the user messages.
        states: The states to be tokenized.
    """
    for state in states:
        # Skip states without a user utterance text.
        if USER not in state or TEXT not in state[USER]:
            continue
        user_tokens = tokenizer.tokenize(Message({TEXT: state[USER][TEXT]}), TEXT)
        state[USER][TEXT] = " ".join(token.text for token in user_tokens)
def test_component_does_not_remove_features(tokenizer, featurizer, msg):
    """If there are features we need to add not remove them"""
    message = Message(msg)
    tokens = tokenizer.tokenize(message, attribute=TEXT)
    tokens = Tokenizer.add_cls_token(tokens, attribute=TEXT)
    message.set(TOKENS_NAMES[TEXT], tokens)

    featurizer.process(message)
    initial_vectors = message.get(DENSE_FEATURE_NAMES[TEXT])

    # A second pass must append to the existing features, not overwrite them,
    # so the feature width doubles.
    featurizer.process(message)
    repeated_vectors = message.get(DENSE_FEATURE_NAMES[TEXT])

    assert repeated_vectors.shape[1] == initial_vectors.shape[1] * 2
def test_elmo_featurizer_process():
    """Processing should yield the expected ELMo vectors for each token."""
    featurizer = ElmoFeaturizer.create({}, RasaNLUModelConfig())

    message = Message("Hey how are you today ?")
    tokens = WhitespaceTokenizer().tokenize(message, attribute=TEXT_ATTRIBUTE)
    tokens = Tokenizer.add_cls_token(tokens, attribute=TEXT_ATTRIBUTE)
    message.set(TOKENS_NAMES[TEXT_ATTRIBUTE], tokens)

    featurizer.process(message)

    expected_first = np.array([2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array([1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353])

    vecs = message.get(DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE])
    assert len(vecs) == len(tokens)
    assert np.allclose(vecs[0][:5], expected_first, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)
from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import CountVectorsFeaturizer
from rasa.nlu.constants import SPACY_DOCS, TEXT, DENSE_FEATURE_NAMES, TOKENS_NAMES, SPARSE_FEATURE_NAMES

logger = logging_setup()

test_input = "Okay, pick up this yellow banana for me."

# --- SpaCy dense features -------------------------------------------------
featurizer = SpacyFeaturizer.create({}, RasaNLUModelConfig())
message = Message(test_input)
message.set(SPACY_DOCS[TEXT], spacy_nlp(test_input))
featurizer._set_spacy_features(message)
spacy_vecs = message.get(DENSE_FEATURE_NAMES[TEXT])
logger.info("SpaCy: {}".format(spacy_vecs.shape))

# --- Mitie features computed directly from tokens -------------------------
message = Message(test_input)
featurizer = MitieFeaturizer.create({}, RasaNLUModelConfig())
MitieTokenizer().process(message)
tokens = message.get(TOKENS_NAMES[TEXT])
mitie_vecs = featurizer.features_for_tokens(tokens, mitie_feature_extractor)
logger.info("Mitie: {}".format(mitie_vecs.shape))

# --- ConveRT features (reuses the message from the Mitie section) ---------
featurizer = ConveRTFeaturizer.create({}, RasaNLUModelConfig())
tokens = ConveRTTokenizer().tokenize(message, attribute=TEXT)
tokens = Tokenizer.add_cls_token(tokens, attribute=TEXT)
message.set(TOKENS_NAMES[TEXT], tokens)
featurizer.process(message)
convert_vecs = message.get(DENSE_FEATURE_NAMES[TEXT])
logger.info("ConveRT: {}".format(convert_vecs.shape))