Example #1
    def tokenize(self, message: Message, attribute: Text) -> List[Token]:
        from janome.tokenizer import Tokenizer
        text = message.get(attribute)
        text = self.removePunctuation(text)

        tokenizer = Tokenizer()
        tokenized = tokenizer.tokenize(text)
        tokens = []
        for token in tokenized:
            tokens.append(Token(token.node.surface, token.node.pos - 1))

        return self._apply_token_pattern(tokens)
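# --- Illustrative sketch, not part of the original example ---
# Same idea with explicit character offsets: run janome over the text and build
# Rasa Token(text, start) objects, where start is the token's character offset
# in the original string. Assumes the Rasa 1.x Token class from
# rasa.nlu.tokenizers.tokenizer and only janome's documented `surface` attribute.
from janome.tokenizer import Tokenizer as JanomeTokenizer
from rasa.nlu.tokenizers.tokenizer import Token

def janome_tokens(text):
    tokenizer = JanomeTokenizer()
    tokens = []
    offset = 0
    for janome_token in tokenizer.tokenize(text):
        surface = janome_token.surface
        # locate the surface form in the raw text to get its start offset
        start = text.index(surface, offset)
        tokens.append(Token(surface, start))
        offset = start + len(surface)
    return tokens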
Example #2
def test_convert_featurizer_train():
    featurizer = ConveRTFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today ?"
    message = Message(sentence)
    message.set(RESPONSE, sentence)
    tokens = ConveRTTokenizer().tokenize(message, attribute=TEXT)
    tokens = Tokenizer.add_cls_token(tokens, attribute=TEXT)
    message.set(TOKENS_NAMES[TEXT], tokens)
    message.set(TOKENS_NAMES[RESPONSE], tokens)

    featurizer.train(TrainingData([message]), RasaNLUModelConfig())

    expected = np.array(
        [2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353])

    vecs = message.get(DENSE_FEATURE_NAMES[TEXT])

    assert len(tokens) == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

    vecs = message.get(DENSE_FEATURE_NAMES[RESPONSE])

    assert len(tokens) == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)

    vecs = message.get(DENSE_FEATURE_NAMES[INTENT])

    assert vecs is None
Example #3
def test_convert_featurizer_process():
    featurizer = ConveRTFeaturizer.create({}, RasaNLUModelConfig())
    sentence = "Hey how are you today ?"
    message = Message(sentence)
    show_message(message)
    tokens = ConveRTTokenizer().tokenize(message, attribute=TEXT)
    tokens = Tokenizer.add_cls_token(tokens, attribute=TEXT)
    message.set(TOKENS_NAMES[TEXT], tokens)
    assert show_message(message, False) == {
        "tokens": ["hey", "how", "are", "you", "today", "__CLS__"],
        "text": "Hey how are you today ?"
    }
    featurizer.process(message)
    show_message(message)

    expected = np.array(
        [2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353])

    vecs = message.get(DENSE_FEATURE_NAMES[TEXT])
    assert len(tokens) == len(vecs)
    assert len(vecs) == 6
    assert len(tokens) == 6
    assert len(vecs[0]) == 1024
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)
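
# --- Hypothetical stand-in, not part of the original example ---
# `show_message` is a helper defined elsewhere in the source. Judging from the
# assertion above, it returns (and optionally prints) a small dict view of the
# Message, roughly along these lines:
def show_message(message, print_it=True):
    summary = {"text": message.text}
    tokens = message.get(TOKENS_NAMES[TEXT])
    if tokens is not None:
        summary["tokens"] = [token.text for token in tokens]
    if print_it:
        print(summary)
    return summary
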
def test_component_adds_features(tokenizer, featurizer, msg):
    """If there are no features we need to add them"""
    message = Message(msg)
    tokens = tokenizer.tokenize(message, attribute=TEXT)
    tokens = Tokenizer.add_cls_token(tokens, attribute=TEXT)
    message.set(TOKENS_NAMES[TEXT], tokens)

    featurizer.process(message)
    vectors = message.get(DENSE_FEATURE_NAMES[TEXT])
    print(vectors.shape)
    assert vectors.shape[0] == len(tokens)
Example #5
def _apply_tokenizer_to_states(tokenizer: Tokenizer,
                               states: List[State]) -> None:
    """Split each user text into tokens and concatenate them again.

    Args:
        tokenizer: A tokenizer to tokenize the user messages.
        states: The states to be tokenized.
    """
    for state in states:
        if USER in state and TEXT in state[USER]:
            state[USER][TEXT] = " ".join(
                token.text for token in tokenizer.tokenize(
                    Message({TEXT: state[USER][TEXT]}), TEXT))
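
# --- Illustrative usage sketch, not part of the original example ---
# Assumes Rasa 2.x style constants and tokenizer (import paths may differ by
# version):
#   from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
#   from rasa.shared.core.constants import USER
#   from rasa.shared.nlu.constants import TEXT
states = [{USER: {TEXT: "where   is   my   order"}}]
_apply_tokenizer_to_states(WhitespaceTokenizer(), states)
# the user text is now re-joined from its tokens with single spaces:
# states[0][USER][TEXT] == "where is my order"
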
def test_component_does_not_remove_features(tokenizer, featurizer, msg):
    """If there are features we need to add not remove them"""
    message = Message(msg)
    tokens = tokenizer.tokenize(message, attribute=TEXT)
    tokens = Tokenizer.add_cls_token(tokens, attribute=TEXT)
    message.set(TOKENS_NAMES[TEXT], tokens)
    featurizer.process(message)
    first_vectors = message.get(DENSE_FEATURE_NAMES[TEXT])

    featurizer.process(message)
    second_vectors = message.get(DENSE_FEATURE_NAMES[TEXT])

    # Processing the same message twice appends the new dense features to the
    # existing ones, so the feature dimension doubles.
    assert (first_vectors.shape[1] * 2) == second_vectors.shape[1]
Example #7
def test_elmo_featurizer_process():
    featurizer = ElmoFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today ?"
    message = Message(sentence)
    tokens = WhitespaceTokenizer().tokenize(message, attribute=TEXT_ATTRIBUTE)
    tokens = Tokenizer.add_cls_token(tokens, attribute=TEXT_ATTRIBUTE)
    message.set(TOKENS_NAMES[TEXT_ATTRIBUTE], tokens)

    featurizer.process(message)

    expected = np.array(
        [2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353])

    vecs = message.get(DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE])

    assert len(tokens) == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)
# Compare the dense feature shapes produced by the SpaCy, MITIE and ConveRT
# featurizers on the same message (spacy_nlp, mitie_feature_extractor and
# logging_setup are provided elsewhere in the source).
from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import CountVectorsFeaturizer

from rasa.nlu.constants import SPACY_DOCS, TEXT, DENSE_FEATURE_NAMES, TOKENS_NAMES, SPARSE_FEATURE_NAMES

logger = logging_setup()

featurizer = SpacyFeaturizer.create({}, RasaNLUModelConfig())

test_input = "Okay, pick up this yellow banana for me."
message = Message(test_input)

message.set(SPACY_DOCS[TEXT], spacy_nlp(test_input))
featurizer._set_spacy_features(message)
vecs = message.get(DENSE_FEATURE_NAMES[TEXT])
logger.info("SpaCy: {}".format(vecs.shape))

message = Message(test_input)
featurizer = MitieFeaturizer.create({}, RasaNLUModelConfig())
MitieTokenizer().process(message)
tokens = message.get(TOKENS_NAMES[TEXT])
vecs = featurizer.features_for_tokens(tokens, mitie_feature_extractor)
logger.info("Mitie: {}".format(vecs.shape))

featurizer = ConveRTFeaturizer.create({}, RasaNLUModelConfig())
tokens = ConveRTTokenizer().tokenize(message, attribute=TEXT)
tokens = Tokenizer.add_cls_token(tokens, attribute=TEXT)
message.set(TOKENS_NAMES[TEXT], tokens)
featurizer.process(message)
vecs = message.get(DENSE_FEATURE_NAMES[TEXT])
logger.info("ConveRT: {}".format(vecs.shape))