Example #1
    def process(self, message: Message, **kwargs: Any) -> None:

        self._check_spacy_doc(message)

        extracted = self.add_extractor_name(self.extract_entities(message))
        message.set("entities", message.get("entities", []) + extracted,
                    add_to_output=True)
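
For context, the `extract_entities` call above reads the named entities that spaCy has already attached to the cached doc. A minimal sketch of such a helper, assuming the message carries a `spacy_doc` as in Examples #18 and #20 (an illustration, not the actual rasa_nlu implementation):

def extract_entities(message):
    # read named entities off the spaCy doc cached on the message
    doc = message.get("spacy_doc")
    return [{
        "entity": ent.label_,     # e.g. "PERSON", "GPE"
        "value": ent.text,        # surface string of the span
        "start": ent.start_char,  # character offsets into message.text
        "end": ent.end_char,
    } for ent in doc.ents]
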
Example #2
def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp):
    import numpy as np
    from rasa_nlu.featurizers.regex_featurizer import RegexFeaturizer
    from rasa_nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
    from rasa_nlu.training_data import Message
    patterns = [
        {"pattern": '[0-9]+', "name": "number", "usage": "intent"},
        {"pattern": '\\bhey*', "name": "hello", "usage": "intent"}
    ]
    ftr = RegexFeaturizer(known_patterns=patterns)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(sentence)
    message.set("spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr.features_for_patterns(message)
    assert np.allclose(result, expected, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get("tokens", [])) > 0
    for i, token in enumerate(message.get("tokens")):
        if i in labeled_tokens:
            assert token.get("pattern") in [0, 1]
        else:
            # if the token is not part of a regex the pattern should not be set
            assert token.get("pattern") is None
Example #3
    def process(self, message: Message, **kwargs: Any) -> None:
        """Return the most likely intent and its probability for a message."""

        if not self.clf:
            # component is either not trained or didn't
            # receive enough training data
            intent = None
            intent_ranking = []
        else:
            X = message.get("text_features").reshape(1, -1)
            intent_ids, probabilities = self.predict(X)
            intents = self.transform_labels_num2str(np.ravel(intent_ids))
            # `predict` returns matrices because it is designed to handle
            # multiple examples at once, so we flatten the single row here
            probabilities = probabilities.flatten()

            if intents.size > 0 and probabilities.size > 0:
                ranking = list(zip(list(intents),
                                   list(probabilities)))[:INTENT_RANKING_LENGTH]

                intent = {"name": intents[0], "confidence": probabilities[0]}

                intent_ranking = [{"name": intent_name, "confidence": score}
                                  for intent_name, score in ranking]
            else:
                intent = {"name": None, "confidence": 0.0}
                intent_ranking = []

        message.set("intent", intent, add_to_output=True)
        message.set("intent_ranking", intent_ranking, add_to_output=True)
Example #4
    def _parse_intent_example(self, example_in_md):
        entities = []
        utter = example_in_md
        for regex in [ent_regex, ent_regex_with_value]:
            utter = re.sub(regex, r"\1", utter)  # [text](entity) -> text
            ent_matches = re.finditer(regex, example_in_md)
            for match in ent_matches:
                if 'synonym' in match.groupdict():
                    entity_value_in_utter = match.groupdict()['synonym']
                else:
                    entity_value_in_utter = match.groupdict()['value']

                start_index = utter.index(entity_value_in_utter)
                end_index = start_index + len(entity_value_in_utter)

                entities.append({
                    'entity': match.groupdict()['entity'],
                    'value': match.groupdict()['value'],
                    'start': start_index,
                    'end': end_index
                })

        message = Message(utter, {'intent': self.current_intent})
        if len(entities) > 0:
            message.set('entities', entities)
        return message
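
`ent_regex` and `ent_regex_with_value` are module-level patterns not shown in the snippet. A plausible reconstruction, matching the named groups the loop reads (`entity`, `value`, `synonym`) and the `[text](entity)` / `[text](entity:value)` Markdown annotation forms (the exact patterns in rasa_nlu may differ):

import re

# [text](entity): the visible text is also the entity value;
# group 1 (= value) is what re.sub keeps in the plain utterance
ent_regex = re.compile(r'\[(?P<value>[^\]]+)\]\((?P<entity>[^:)]+?)\)')

# [text](entity:value): the visible text is a synonym of `value`
ent_regex_with_value = re.compile(
    r'\[(?P<synonym>[^\]]+)\]\((?P<entity>[^:)]+?):(?P<value>[^)]+)\)')

Because only `ent_regex_with_value` defines a `synonym` group, the `'synonym' in match.groupdict()` check in the loop tells the two annotation forms apart.
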
Example #5
def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp):
    import numpy as np
    from rasa_nlu.featurizers.regex_featurizer import RegexFeaturizer
    from rasa_nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
    from rasa_nlu.training_data import Message
    patterns = [
        {"pattern": '[0-9]+', "name": "number", "usage": "intent"},
        {"pattern": '\\bhey*', "name": "hello", "usage": "intent"},
        {"pattern": '[0-1]+', "name": "binary", "usage": "intent"}
    ]
    ftr = RegexFeaturizer(known_patterns=patterns)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(sentence)
    message.set("spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr.features_for_patterns(message)
    assert np.allclose(result, expected, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get("tokens", [])) > 0
    # each token should carry exactly as many pattern matches as its
    # index appears in labeled_tokens
    for i, token in enumerate(message.get("tokens")):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
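
The `sentence`, `expected`, and `labeled_tokens` arguments come from a `pytest.mark.parametrize` decorator that the excerpt omits (`spacy_nlp` is a fixture providing a loaded spaCy model). A hedged illustration of what cases could look like for the three patterns above, assuming each feature is 1.0 when its pattern occurs anywhere in the sentence (values chosen for illustration, not copied from the rasa_nlu test suite):

import pytest

@pytest.mark.parametrize("sentence, expected, labeled_tokens", [
    # only '\bhey*' fires; token 0 ("hey") carries one pattern match
    ("hey how are you today", [0., 1., 0.], [0]),
    # '[0-9]+' and '[0-1]+' both fire on "101"; token 1 matches two
    # patterns, hence its index appears twice
    ("code 101 please", [1., 0., 1.], [1, 1]),
])
def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp):
    ...  # body as in Example #5 above
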
Example #6
def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp):
    import numpy as np
    from rasa_nlu.featurizers.regex_featurizer import RegexFeaturizer
    from rasa_nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
    from rasa_nlu.training_data import Message

    lookups = [
        {"name": 'drinks', "elements": ["mojito", "lemonade",
                                        "sweet berry wine",
                                        "tea", "club?mate"]},
        {"name": 'plates', "elements": "data/test/lookup_tables/plates.txt"}
    ]
    ftr = RegexFeaturizer(lookup_tables=lookups)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(sentence)
    message.set("spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr.features_for_patterns(message)
    assert np.allclose(result, expected, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get("tokens", [])) > 0
    # each token should carry exactly as many pattern matches as its
    # index appears in labeled_tokens
    for i, token in enumerate(message.get("tokens")):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
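
Lookup table elements can be given inline (as with 'drinks') or as a path to a newline-delimited file (as with 'plates'). One way to featurize them is to compile all elements of a table into a single case-insensitive alternation regex; a minimal sketch of that idea (illustrative, not the actual rasa_nlu code):

import io
import re

def lookup_regex(lookup_table):
    elements = lookup_table["elements"]
    if isinstance(elements, str):
        # a string is treated as a path to a newline-delimited file
        with io.open(elements, encoding="utf-8") as f:
            elements = [line.strip() for line in f if line.strip()]
    # escape each element so literals like "club?mate" keep their
    # question mark instead of it acting as a regex quantifier
    return "(?i)(" + "|".join(re.escape(e) for e in elements) + ")"

The `re.escape` step is why "club?mate" in the test data is safe: unescaped, the `?` would make the preceding character optional.
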
Example #7
    def _parse_training_example(self, example):
        """Extract entities and synonyms, and convert to plain text."""
        entities = self._find_entities_in_training_example(example)
        plain_text = re.sub(ent_regex,
                            lambda m: m.groupdict()['entity_text'],
                            example)
        self._add_synonyms(plain_text, entities)
        message = Message(plain_text, {'intent': self.current_title})
        if len(entities) > 0:
            message.set('entities', entities)
        return message
Example #8
    def process(self, message: Message, **kwargs: Any) -> None:

        mitie_feature_extractor = kwargs.get("mitie_feature_extractor")
        if not mitie_feature_extractor:
            raise Exception("Failed to train 'MitieFeaturizer'. "
                            "Missing a proper MITIE feature extractor.")

        ents = self.extract_entities(message.text, message.get("tokens"),
                                     mitie_feature_extractor)
        extracted = self.add_extractor_name(ents)
        message.set("entities", message.get("entities", []) + extracted,
                    add_to_output=True)
Example #9
def test_count_vector_featurizer(sentence, expected):
    import numpy as np
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer
    from rasa_nlu.training_data import Message, TrainingData

    ftr = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b'})
    message = Message(sentence)
    message.set("intent", "bla")
    data = TrainingData([message])

    ftr.train(data)
    ftr.process(message)

    assert np.all(message.get("text_features")[0] == expected)
Example #10
def test_count_vector_featurizer(sentence, expected):
    import numpy as np
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer
    from rasa_nlu.training_data import Message, TrainingData

    ftr = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b'})
    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(sentence)
    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)
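
Here `expected` follows scikit-learn's `CountVectorizer` convention: one count per vocabulary word, with the vocabulary sorted alphabetically. A hedged illustration of matching parametrize cases (values chosen for illustration, not copied from the rasa_nlu test suite):

import pytest

@pytest.mark.parametrize("sentence, expected", [
    # vocabulary sorts to ["goodbye", "hello"], so counts are [1, 2]
    ("hello goodbye hello", [1, 2]),
    # single-word vocabulary ["hello"] with three occurrences
    ("hello hello hello", [3]),
])
def test_count_vector_featurizer(sentence, expected):
    ...  # body as in Example #10 above
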
Example #11
def test_count_vector_featurizer_oov_token(sentence, expected):
    import numpy as np
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer
    from rasa_nlu.training_data import Message, TrainingData

    ftr = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b',
                                  "OOV_token": '__oov__'})
    train_message = Message(sentence)
    train_message.set("intent", "bla")
    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(sentence)
    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)
Example #12
def test_count_vector_featurizer(sentence, expected):
    import numpy as np
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer
    from rasa_nlu.training_data import Message, TrainingData

    ftr = CountVectorsFeaturizer({"min_ngram": 1,
                                  "max_ngram": 2,
                                  "analyzer": 'char'})
    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(sentence)
    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)
Example #13
    def process(self, message: Message, **kwargs: Any) -> None:

        mitie_feature_extractor = kwargs.get("mitie_feature_extractor")
        if not mitie_feature_extractor:
            raise Exception("Failed to train 'MitieFeaturizer'. "
                            "Missing a proper MITIE feature extractor.")

        if self.clf:
            token_strs = self._tokens_of_message(message)
            intent, confidence = self.clf(token_strs, mitie_feature_extractor)
        else:
            # either the model didn't get trained or it wasn't
            # provided with any data
            intent = None
            confidence = 0.0

        message.set("intent", {"name": intent, "confidence": confidence},
                    add_to_output=True)
Example #14
def test_count_vector_featurizer_using_tokens(tokens, expected):
    import numpy as np
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer
    from rasa_nlu.tokenizers import Token
    from rasa_nlu.training_data import Message, TrainingData

    ftr = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b'})

    # use an empty string instead of real text to make sure the
    # count vectors can only be built from the `tokens` feature;
    # relying on `message.text` would not produce the right counts

    # Token(text, offset): build one token per word, offsets unused here
    tokens_feature = [Token(i, 0) for i in tokens]

    train_message = Message("")
    train_message.set("tokens", tokens_feature)
    train_message.set("intent", "bla")  # this is needed for a valid training example
    data = TrainingData([train_message])

    ftr.train(data)

    test_message = Message("")
    test_message.set("tokens", tokens_feature)

    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)
Example #15
    def process(self, message: Message, **kwargs: Any) -> None:

        message.set("tokens", self.tokenize(message.get("spacy_doc")))
Example #16
    def process(self, message: Message, **kwargs: Any) -> None:

        updated_entities = message.get("entities", [])[:]
        self.replace_synonyms(updated_entities)
        message.set("entities", updated_entities, add_to_output=True)
Example #17
    def process(self, message: Message, **kwargs: Any) -> None:

        updated = self._text_features_with_regex(message)
        message.set("text_features", updated)
Example #18
    def process(self, message: Message, **kwargs: Any) -> None:

        message.set("spacy_doc", self.doc_for_text(message.text))
Example #19
    def process(self, message: Message, **kwargs: Any) -> None:

        extracted = self.add_extractor_name(self.extract_entities(message))
        message.set("entities", message.get("entities", []) + extracted,
                    add_to_output=True)
Example #20
    def process(self, message: Message, **kwargs: Any) -> None:

        message.set("spacy_doc", self.doc_for_text(message.text))
Example #21
    def process(self, message: Message, **kwargs: Any) -> None:
        message.set("tokens", self.tokenize(message.text))
Example #22
    def process(self, message: Message, **kwargs: Any) -> None:

        updated_entities = message.get("entities", [])[:]
        self.replace_synonyms(updated_entities)
        message.set("entities", updated_entities, add_to_output=True)
Example #23
    def process(self, message: Message, **kwargs: Any) -> None:

        updated = self._text_features_with_regex(message)
        message.set("text_features", updated)
Example #24
    def process(self, message: Message, **kwargs: Any) -> None:

        intent = {"name": self.parse(message.text), "confidence": 1.0}
        message.set("intent", intent,
                    add_to_output=True)
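
`self.parse` here is a simple keyword lookup that returns an intent name or `None`. A minimal sketch of such a matcher with a hypothetical keyword map (not the actual rasa_nlu implementation):

def parse(text):
    # naive substring matching against a fixed keyword map
    keywords = {"hello": "greet", "bye": "goodbye"}  # hypothetical
    for keyword, intent in keywords.items():
        if keyword in text.lower():
            return intent
    # mirror the snippet above: no keyword fired
    return None

Because a keyword either matches or it does not, the component can report a flat `confidence` of 1.0; a `name` of `None` signals that nothing matched.
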
Example #25
    def process(self, message: Message, **kwargs: Any) -> None:
        spacy_nlp = kwargs.get("spacy_nlp", None)
        message.text = lemmatize(message.text)
        message.set("spacy_doc", self.doc_for_text(message.text, spacy_nlp))
Example #26
    def process(self, message: Message, **kwargs: Any) -> None:

        message.set("tokens", self.tokenize(message.text))
Example #27
    def process(self, message: Message, **kwargs: Any) -> None:

        updated = self._text_features_with_ngrams(message, self.best_num_ngrams)
        message.set("text_features", updated)
Example #28
    def process(self, message: Message, **kwargs: Any) -> None:

        message.set("tokens", self.tokenize(message.get("spacy_doc")))
Example #29
    def process(self, message: Message, **kwargs: Any) -> None:

        intent = {"name": self.parse(message.text), "confidence": 1.0}
        message.set("intent", intent, add_to_output=True)