Example #1
def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa_nlu.featurizers.regex_featurizer import RegexFeaturizer

    lookups = [
        {"name": 'drinks', "elements": ["mojito", "lemonade",
                                        "sweet berry wine",
                                        "tea", "club?mate"]},
        {"name": 'plates', "elements": "data/test/lookup_tables/plates.txt"}
    ]
    ftr = RegexFeaturizer(lookup_tables=lookups)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(sentence)
    message.set("spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr.features_for_patterns(message)
    assert np.allclose(result, expected, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get("tokens", [])) > 0
    # the number of regex matches on each token should match
    for i, token in enumerate(message.get("tokens")):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
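
The `sentence`, `expected`, and `labeled_tokens` arguments come from a `pytest.mark.parametrize` decorator in the original test module, and `spacy_nlp` is a fixture. A minimal sketch of how such a case might be wired up; the concrete sentence and expectations below are illustrative, not taken from the real suite:

import pytest

@pytest.mark.parametrize("sentence, expected, labeled_tokens", [
    # hypothetical case: token 0 ("mojito") hits the 'drinks' table and
    # nothing hits 'plates', so the lookup feature vector is [1.0, 0.0]
    ("mojito please", [1.0, 0.0], [0]),
])
def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp):
    ...  # body as in the example above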
Example #2
def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa_nlu.featurizers.regex_featurizer import RegexFeaturizer
    patterns = [
        {"pattern": '[0-9]+', "name": "number", "usage": "intent"},
        {"pattern": '\\bhey*', "name": "hello", "usage": "intent"},
        {"pattern": '[0-1]+', "name": "binary", "usage": "intent"}
    ]
    ftr = RegexFeaturizer(known_patterns=patterns)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(sentence)
    message.set("spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr.features_for_patterns(message)
    assert np.allclose(result, expected, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get("tokens", [])) > 0
    # the number of regex matches on each token should match
    for i, token in enumerate(message.get("tokens")):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
Example #3
def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa_nlu.featurizers.regex_featurizer import RegexFeaturizer
    patterns = [
        {"pattern": '[0-9]+', "name": "number", "usage": "intent"},
        {"pattern": '\\bhey*', "name": "hello", "usage": "intent"}
    ]
    ftr = RegexFeaturizer(known_patterns=patterns)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(sentence)
    message.set("spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr.features_for_patterns(message)
    assert np.allclose(result, expected, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get("tokens", [])) > 0
    for i, token in enumerate(message.get("tokens")):
        if i in labeled_tokens:
            assert token.get("pattern") in [0, 1]
        else:
            # if the token is not part of a regex the pattern should not be set
            assert token.get("pattern") is None
Example #4
    def _from_json_to_crf(self,
                          message: Message,
                          entity_offsets: List[Tuple[int, int, Text]]
                          ) -> List[Tuple[Text, Text, Text, Text]]:
        """Convert json examples to format of underlying crfsuite."""

        if self.pos_features:
            from spacy.gold import GoldParse

            doc = message.get("spacy_doc")
            gold = GoldParse(doc, entities=entity_offsets)
            ents = [l[5] for l in gold.orig_annot]
        else:
            tokens = message.get("tokens")
            ents = self._bilou_tags_from_offsets(tokens, entity_offsets)

        if '-' in ents:
            logger.warning("Misaligned entity annotation in sentence '{}'. "
                           "Make sure the start and end values of the "
                           "annotated training examples end at token "
                           "boundaries (e.g. don't include trailing "
                           "whitespaces or punctuation)."
                           "".format(message.text))
        if not self.component_config["BILOU_flag"]:
            for i, label in enumerate(ents):
                if self._bilou_from_label(label) in {"B", "I", "U", "L"}:
                    # removes BILOU prefix from label
                    ents[i] = self._entity_from_label(label)

        return self._from_text_to_crf(message, ents)
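
`_bilou_from_label` and `_entity_from_label` are not shown here; under the BILOU scheme a tag such as "B-city" combines a position prefix (B/I/L/U) with the entity type, and non-entity tokens carry a plain "O". A self-contained sketch of the stripping step, with stand-in helper names rather than the actual Rasa implementations:

def bilou_prefix(label):
    # "B-city" -> "B"; a plain "O" has no prefix
    return label[0] if len(label) >= 2 and label[1] == "-" else None

def entity_name(label):
    # "B-city" -> "city"
    return label[2:] if bilou_prefix(label) else label

ents = ["O", "B-city", "L-city"]
stripped = [entity_name(e) if bilou_prefix(e) in {"B", "I", "U", "L"} else e
            for e in ents]
assert stripped == ["O", "city", "city"]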
Example #5
    def _from_json_to_crf(
        self, message: Message, entity_offsets: List[Tuple[int, int, Text]]
    ) -> List[Tuple[Text, Text, Text, Text]]:
        """Convert json examples to format of underlying crfsuite."""

        if self.pos_features:
            from spacy.gold import GoldParse

            doc = message.get("spacy_doc")
            gold = GoldParse(doc, entities=entity_offsets)
            ents = [l[5] for l in gold.orig_annot]
        else:
            tokens = message.get("tokens")
            ents = self._bilou_tags_from_offsets(tokens, entity_offsets)

        if '-' in ents:
            logger.warning("Misaligned entity annotation in sentence '{}'. "
                           "Make sure the start and end values of the "
                           "annotated training examples end at token "
                           "boundaries (e.g. don't include trailing "
                           "whitespaces or punctuation)."
                           "".format(message.text))
        if not self.component_config["BILOU_flag"]:
            for i, label in enumerate(ents):
                if self._bilou_from_label(label) in {"B", "I", "U", "L"}:
                    # removes BILOU prefix from label
                    ents[i] = self._entity_from_label(label)

        return self._from_text_to_crf(message, ents)
Example #6
def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa_nlu.featurizers.regex_featurizer import RegexFeaturizer

    lookups = [
        {"name": 'drinks',
         "elements": ["mojito", "lemonade", "sweet berry wine",
                      "tea", "club?mate"]},
        {"name": 'plates',
         "elements": "data/test/lookup_tables/plates.txt"}
    ]
    ftr = RegexFeaturizer(lookup_tables=lookups)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(sentence)
    message.set("spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr.features_for_patterns(message)
    assert np.allclose(result, expected, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get("tokens", [])) > 0
    # the number of regex matches on each token should match
    for i, token in enumerate(message.get("tokens")):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
Example #7
def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa_nlu.featurizers.regex_featurizer import RegexFeaturizer
    patterns = [
        {"pattern": '[0-9]+', "name": "number", "usage": "intent"},
        {"pattern": '\\bhey*', "name": "hello", "usage": "intent"},
        {"pattern": '[0-1]+', "name": "binary", "usage": "intent"}
    ]
    ftr = RegexFeaturizer(known_patterns=patterns)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(sentence)
    message.set("spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr.features_for_patterns(message)
    assert np.allclose(result, expected, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get("tokens", [])) > 0
    # the number of regex matches on each token should match
    for i, token in enumerate(message.get("tokens")):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
Example #8
    def process(self, message: Message, **kwargs: Any) -> None:

        mitie_feature_extractor = kwargs.get("mitie_feature_extractor")
        if not mitie_feature_extractor:
            raise Exception("Failed to train 'MitieFeaturizer'. "
                            "Missing a proper MITIE feature extractor.")

        ents = self.extract_entities(message.text, message.get("tokens"),
                                     mitie_feature_extractor)
        extracted = self.add_extractor_name(ents)
        message.set("entities", message.get("entities", []) + extracted,
                    add_to_output=True)
Example #9
def test_spacy_ner_extractor(spacy_nlp):
    ext = SpacyEntityExtractor()
    example = Message("anywhere in the West", {
            "intent": "restaurant_search",
            "entities": [],
            "spacy_doc": spacy_nlp("anywhere in the west")})

    ext.process(example, spacy_nlp=spacy_nlp)

    assert len(example.get("entities", [])) == 1
    assert example.get("entities")[0] == {
        u'start': 16, u'extractor': u'ner_spacy',
        u'end': 20, u'value': u'West', u'entity': u'LOC'}
Example #10
def test_spacy_ner_extractor(component_builder, spacy_nlp):
    _config = RasaNLUModelConfig(
        {"pipeline": [{
            "name": "SpacyEntityExtractor"
        }]})
    ext = component_builder.create_component(_config.for_component(0), _config)
    example = Message(
        "anywhere in the West", {
            "intent": "restaurant_search",
            "entities": [],
            "spacy_doc": spacy_nlp("anywhere in the west")
        })
    ext.process(example, spacy_nlp=spacy_nlp)

    assert len(example.get("entities", [])) == 1
    assert example.get("entities")[0] == {
        'start': 16,
        'extractor': 'SpacyEntityExtractor',
        'end': 20,
        'value': 'West',
        'entity': 'LOC',
        'confidence': None
    }

    # Test dimension filtering includes only specified dimensions

    example = Message(
        "anywhere in the West with Sebastian Thrun", {
            "intent": "example_intent",
            "entities": [],
            "spacy_doc": spacy_nlp("anywhere in the West with Sebastian Thrun")
        })
    _config = RasaNLUModelConfig(
        {"pipeline": [{
            "name": "SpacyEntityExtractor"
        }]})
    _config.set_component_attr(0, dimensions=["PERSON"])
    ext = component_builder.create_component(_config.for_component(0), _config)
    ext.process(example, spacy_nlp=spacy_nlp)

    assert len(example.get("entities", [])) == 1
    assert example.get("entities")[0] == {
        'start': 26,
        'extractor': 'SpacyEntityExtractor',
        'end': 41,
        'value': 'Sebastian Thrun',
        'entity': 'PERSON',
        'confidence': None
    }
Example #11
def test_count_vector_featurizer_using_tokens(tokens, expected):
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b'})

    # use an empty string instead of real text to make sure the
    # count vector can only come from the `tokens` feature;
    # using `message.text` would not produce the correct result

    tokens_feature = [Token(i, 0) for i in tokens]

    train_message = Message("")
    train_message.set("tokens", tokens_feature)
    train_message.set("intent", "bla")  # this is needed for a valid training example
    data = TrainingData([train_message])

    ftr.train(data)

    test_message = Message("")
    test_message.set("tokens", tokens_feature)

    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)
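
`Token(i, 0)` builds a token whose surface text is `i` at character offset 0; the offsets are irrelevant here because only the token text feeds the count vectorizer. A tiny standalone check of that reading, assuming rasa_nlu's `Token(text, offset)` constructor with a `.text` attribute as used in the other examples:

from rasa_nlu.tokenizers import Token

tokens_feature = [Token(t, 0) for t in ["hello", "world"]]
assert [t.text for t in tokens_feature] == ["hello", "world"]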
Example #12
    def process(self, message: Message, **kwargs: Any) -> None:

        self._check_spacy_doc(message)

        extracted = self.add_extractor_name(self.extract_entities(message))
        message.set("entities", message.get("entities", []) + extracted,
                    add_to_output=True)
Example #13
def test_duckling_entity_extractor(component_builder):
    _config = utilities.base_test_conf("all_components")
    _config["duckling_dimensions"] = ["time"]
    duckling = component_builder.create_component("ner_duckling", _config)
    message = Message("Today is the 5th of May. Let us meet tomorrow.")
    duckling.process(message)
    entities = message.get("entities")
    assert len(entities) == 3

    # Test duckling with a defined date
    message = Message("Let us meet tomorrow.", time="1381536182000")  # 1381536182000 == 2013/10/12 02:03:02
    duckling.process(message)
    entities = message.get("entities")
    assert len(entities) == 1
    assert entities[0]["text"] == "tomorrow"
    assert entities[0]["value"] == "2013-10-13T00:00:00.000Z"
Example #14
    def process(self, message: Message, **kwargs: Any) -> None:
        """Return the most likely intent and its probability for a message."""

        if not self.clf:
            # component is either not trained or didn't
            # receive enough training data
            intent = None
            intent_ranking = []
        else:
            X = message.get("text_features").reshape(1, -1)
            intent_ids, probabilities = self.predict(X)
            intents = self.transform_labels_num2str(np.ravel(intent_ids))
            # `predict` returns a matrix as it is supposed
            # to work for multiple examples as well, hence we need to flatten
            probabilities = probabilities.flatten()

            if intents.size > 0 and probabilities.size > 0:
                ranking = list(
                    zip(list(intents),
                        list(probabilities)))[:INTENT_RANKING_LENGTH]

                intent = {"name": intents[0], "confidence": probabilities[0]}

                intent_ranking = [{
                    "name": intent_name,
                    "confidence": score
                } for intent_name, score in ranking]
            else:
                intent = {"name": None, "confidence": 0.0}
                intent_ranking = []

        message.set("intent", intent, add_to_output=True)
        message.set("intent_ranking", intent_ranking, add_to_output=True)
Example #15
def test_duckling_entity_extractor(component_builder):
    _config = utilities.base_test_conf("all_components")
    _config["duckling_dimensions"] = ["time"]
    duckling = component_builder.create_component("ner_duckling", _config)
    message = Message("Today is the 5th of May. Let us meet tomorrow.")
    duckling.process(message)
    entities = message.get("entities")
    assert len(entities) == 3

    # Test duckling with a defined date
    message = Message("Let us meet tomorrow.", time="1381536182000")  # 1381536182000 == 2013/10/12 02:03:02
    duckling.process(message)
    entities = message.get("entities")
    assert len(entities) == 1
    assert entities[0]["text"] == "tomorrow"
    assert entities[0]["value"] == "2013-10-13T00:00:00.000Z"
Example #16
def test_count_vector_featurizer_using_tokens(tokens, expected):
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b'})

    # use an empty string instead of real text to make sure the
    # count vector can only come from the `tokens` feature;
    # using `message.text` would not produce the correct result

    tokens_feature = [Token(i, 0) for i in tokens]

    train_message = Message("")
    train_message.set("tokens", tokens_feature)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])

    ftr.train(data)

    test_message = Message("")
    test_message.set("tokens", tokens_feature)

    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)
Example #17
    def process(self, message: Message, **kwargs: Any) -> None:
        """Return the most likely intent and its probability for a message."""

        if not self.clf:
            # component is either not trained or didn't
            # receive enough training data
            intent = None
            intent_ranking = []
        else:
            X = message.get("text_features").reshape(1, -1)
            intent_ids, probabilities = self.predict(X)
            intents = self.transform_labels_num2str(np.ravel(intent_ids))
            # `predict` returns a matrix as it is supposed
            # to work for multiple examples as well, hence we need to flatten
            probabilities = probabilities.flatten()

            if intents.size > 0 and probabilities.size > 0:
                ranking = list(zip(list(intents),
                                   list(probabilities)))[:INTENT_RANKING_LENGTH]

                intent = {"name": intents[0], "confidence": probabilities[0]}

                intent_ranking = [{"name": intent_name, "confidence": score}
                                  for intent_name, score in ranking]
            else:
                intent = {"name": None, "confidence": 0.0}
                intent_ranking = []

        message.set("intent", intent, add_to_output=True)
        message.set("intent_ranking", intent_ranking, add_to_output=True)
Example #18
    def _from_text_to_crf(self,
                          message: Message,
                          entities: List[Text] = None
                          ) -> List[Tuple[Text, Text, Text, Text]]:
        """Takes a sentence and switches it to crfsuite format."""

        crf_format = []
        if self.pos_features:
            tokens = message.get("spacy_doc")
        else:
            tokens = message.get("tokens")
        for i, token in enumerate(tokens):
            pattern = self.__pattern_of_token(message, i)
            entity = entities[i] if entities else "N/A"
            tag = self.__tag_of_token(token) if self.pos_features else None
            crf_format.append((token.text, tag, entity, pattern))
        return crf_format
Example #19
    def process(self, message: Message, **kwargs: Any) -> None:

        mitie_feature_extractor = self._mitie_feature_extractor(**kwargs)
        features = self.features_for_tokens(message.get("tokens"),
                                            mitie_feature_extractor)
        message.set(
            "text_features",
            self._combine_with_existing_text_features(message, features))
Example #20
def test_spacy_ner_extractor(spacy_nlp):
    ext = SpacyEntityExtractor()
    example = Message("anywhere in the West", {
        "intent": "restaurant_search",
        "entities": [],
        "spacy_doc": spacy_nlp("anywhere in the west")})

    ext.process(example, spacy_nlp=spacy_nlp)

    assert len(example.get("entities", [])) == 1
    assert example.get("entities")[0] == {
        'start': 16,
        'extractor': 'ner_spacy',
        'end': 20,
        'value': 'West',
        'entity': 'LOC',
        'confidence': None}
Example #21
def test_duckling_entity_extractor(component_builder):
    _config = utilities.base_test_conf("all_components")
    _config["duckling_dimensions"] = ["time"]
    duckling = component_builder.create_component("ner_duckling", _config)
    message = Message("Today is the 5th of May. Let us meet tomorrow.")
    duckling.process(message)
    entities = message.get("entities")
    assert len(entities) == 3
Example #22
    def process(self, message: Message, **kwargs: Any) -> None:
        # can't use the existing doc here (spacy_doc on the message)
        # because its tokens are lowercased, which is bad for NER
        spacy_nlp = kwargs.get("spacy_nlp", None)
        doc = spacy_nlp(message.text)
        extracted = self.add_extractor_name(self.extract_entities(doc))
        message.set("entities",
                    message.get("entities", []) + extracted,
                    add_to_output=True)
Example #23
def test_duckling_entity_extractor(component_builder):
    _config = RasaNLUModelConfig({"pipeline": [{"name": "ner_duckling"}]})
    _config.set_component_attr("ner_duckling", dimensions=["time"])
    duckling = component_builder.create_component("ner_duckling", _config)
    message = Message("Today is the 5th of May. Let us meet tomorrow.")
    duckling.process(message)
    entities = message.get("entities")
    assert len(entities) == 3

    # Test duckling with a defined date

    # 1381536182000 == 2013/10/12 02:03:02
    message = Message("Let us meet tomorrow.", time="1381536182000")
    duckling.process(message)
    entities = message.get("entities")
    assert len(entities) == 1
    assert entities[0]["text"] == "tomorrow"
    assert entities[0]["value"] == "2013-10-13T00:00:00.000Z"
Example #25
    def _from_crf_to_json(self, message: Message,
                          entities: List[Any]) -> List[Dict[Text, Any]]:

        if self.pos_features:
            tokens = message.get("spacy_doc")
        else:
            tokens = message.get("tokens")

        if len(tokens) != len(entities):
            raise Exception('Inconsistency in amount of tokens '
                            'between crfsuite and message')

        if self.component_config["BILOU_flag"]:
            return self._convert_bilou_tagging_to_entity_result(
                tokens, entities)
        else:
            # not using BILOU tagging scheme, multi-word entities are split.
            return self._convert_simple_tagging_to_entity_result(
                tokens, entities)
Example #26
    def _from_crf_to_json(self,
                          message: Message,
                          entities: List[Any]) -> List[Dict[Text, Any]]:

        if self.pos_features:
            tokens = message.get("spacy_doc")
        else:
            tokens = message.get("tokens")

        if len(tokens) != len(entities):
            raise Exception('Inconsistency in amount of tokens '
                            'between crfsuite and message')

        if self.component_config["BILOU_flag"]:
            return self._convert_bilou_tagging_to_entity_result(
                tokens, entities)
        else:
            # not using BILOU tagging scheme, multi-word entities are split.
            return self._convert_simple_tagging_to_entity_result(
                tokens, entities)
Example #27
def test_cabocha_comps(text, nlp, nlp_doc):
    ext = CabochaTokenizer()

    example = Message(text, {
        "intent": "wish",
        "entities": [],
        "cabocha_doc": nlp_doc
    })

    # tokenizer
    ext.process(example, cabocha=nlp)
    for token in example.get("tokens"):
        print(token.text, token.offset)

    # entity extractor
    ext = CabochaEntityExtractor()
    ext.process(example, cabocha=nlp)
    print("total entities", len(example.get("entities", [])))
    for ent in example.get("entities"):
        print(ent)
Example #28
    def process(self, message: Message, **kwargs: Any) -> None:
        # can't use the existing doc here (spacy_doc on the message)
        # because its tokens are lowercased, which is bad for NER
        spacy_nlp = kwargs.get("spacy_nlp", None)
        doc = spacy_nlp(message.text)
        all_extracted = self.add_extractor_name(self.extract_entities(doc))
        dimensions = self.component_config["dimensions"]
        extracted = SpacyEntityExtractor.filter_irrelevant_entities(
            all_extracted, dimensions)
        message.set("entities",
                    message.get("entities", []) + extracted,
                    add_to_output=True)
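
`filter_irrelevant_entities` keeps only the entities whose type appears in the configured `dimensions`. A minimal sketch of that filtering step, assuming entity dicts with an "entity" key as in the surrounding examples (this is not the actual Rasa implementation):

def filter_by_dimensions(extracted, dimensions):
    # keep an entity only if its type is among the requested dimensions;
    # with no dimensions configured, keep everything
    if not dimensions:
        return extracted
    return [e for e in extracted if e["entity"] in dimensions]

entities = [{"entity": "LOC", "value": "West"},
            {"entity": "PERSON", "value": "Sebastian Thrun"}]
assert filter_by_dimensions(entities, ["PERSON"]) == [
    {"entity": "PERSON", "value": "Sebastian Thrun"}]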
Example #29
def test_spacy_ner_extractor(component_builder, spacy_nlp):
    _config = RasaNLUModelConfig({"pipeline":
                                 [{"name": "SpacyEntityExtractor"}]})
    ext = component_builder.create_component(_config.for_component(0), _config)
    example = Message("anywhere in the West", {
        "intent": "restaurant_search",
        "entities": [],
        "spacy_doc": spacy_nlp("anywhere in the west")})
    ext.process(example, spacy_nlp=spacy_nlp)

    assert len(example.get("entities", [])) == 1
    assert example.get("entities")[0] == {
        'start': 16,
        'extractor': 'SpacyEntityExtractor',
        'end': 20,
        'value': 'West',
        'entity': 'LOC',
        'confidence': None}

    # Test dimension filtering includes only specified dimensions

    example = Message("anywhere in the West with Sebastian Thrun", {
        "intent": "example_intent",
        "entities": [],
        "spacy_doc": spacy_nlp("anywhere in the West with Sebastian Thrun")})
    _config = RasaNLUModelConfig({"pipeline":
                                 [{"name": "SpacyEntityExtractor"}]})
    _config.set_component_attr(0, dimensions=["PERSON"])
    ext = component_builder.create_component(_config.for_component(0), _config)
    ext.process(example, spacy_nlp=spacy_nlp)

    assert len(example.get("entities", [])) == 1
    assert example.get("entities")[0] == {
        'start': 26,
        'extractor': 'SpacyEntityExtractor',
        'end': 41,
        'value': 'Sebastian Thrun',
        'entity': 'PERSON',
        'confidence': None}
Example #30
def test_count_vector_featurizer(sentence, expected):
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b'})
    message = Message(sentence)
    message.set("intent", "bla")
    data = TrainingData([message])

    ftr.train(data)
    ftr.process(message)

    assert np.all(message.get("text_features")[0] == expected)
Example #32
    def neg_featurize(self, neg_train_data):
        """
        Use the previously trained featurizers in the pipeline to featurize the negative training data (a set of sentences)
        """

        X_neg = []
        for example in neg_train_data:
            m = Message(example)
            self.partially_process(m)
            #print("message: {}; intent: {}".format(example, m.get("intent")))
            X_neg.append(m.get("text_features"))

        X_neg = np.array(X_neg)
        return X_neg
Example #33
def test_count_vector_featurizer(sentence, expected):
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b'})
    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(sentence)
    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)
Example #35
def test_count_vector_featurizer_oov_token(sentence, expected):
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b',
                                  "OOV_token": '__oov__'})
    train_message = Message(sentence)
    train_message.set("intent", "bla")
    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(sentence)
    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)
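
The `OOV_token` option maps words that were never seen during training onto a single shared token before counting. A standalone illustration of the idea, not the actual CountVectorsFeaturizer code:

def apply_oov(tokens, vocabulary, oov_token='__oov__'):
    # replace any token outside the known vocabulary with the OOV token
    return [t if t in vocabulary else oov_token for t in tokens]

result = apply_oov(["hello", "unseenword"], {"hello", "__oov__"})
assert result == ["hello", "__oov__"]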
Example #36
def test_count_vector_featurizer(sentence, expected):
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"min_ngram": 1,
                                  "max_ngram": 2,
                                  "analyzer": 'char'})
    train_message = Message(sentence)
    # this is needed for a valid training example
    train_message.set("intent", "bla")
    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(sentence)
    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)
Example #37
def test_count_vector_featurizer_oov_token(sentence, expected):
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({
        "token_pattern": r'(?u)\b\w+\b',
        "OOV_token": '__oov__'
    })
    train_message = Message(sentence)
    train_message.set("intent", "bla")
    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message(sentence)
    ftr.process(test_message)

    assert np.all(test_message.get("text_features") == expected)
Example #38
    def process(self, message: Message, **kwargs: Any) -> None:

        if self._url() is not None:
            reference_time = self._reference_time_from_message(message)
            matches = self._duckling_parse(message.text, reference_time)
            dimensions = self.component_config["dimensions"]
            relevant_matches = filter_irrelevant_matches(matches, dimensions)
            extracted = convert_duckling_format_to_rasa(relevant_matches)
        else:
            extracted = []
            logger.warning("Duckling HTTP component in pipeline, but no "
                           "`url` configuration in the config "
                           "file nor is `RASA_DUCKLING_HTTP_URL` "
                           "set as an environment variable.")

        extracted = self.add_extractor_name(extracted)
        message.set("entities",
                    message.get("entities", []) + extracted,
                    add_to_output=True)
Example #39
    def process(self, message: Message, **kwargs: Any) -> None:

        updated_entities = message.get("entities", [])[:]
        self.replace_synonyms(updated_entities)
        message.set("entities", updated_entities, add_to_output=True)
Example #40
def test_duckling_entity_extractor(component_builder):
    httpretty.register_uri(
        httpretty.POST,
        "http://localhost:8000/parse",
        body="""[{"body":"Today","start":0,"value":{"values":[{
             "value":"2018-11-13T00:00:00.000-08:00","grain":"day",
             "type":"value"}],"value":"2018-11-13T00:00:00.000-08:00",
             "grain":"day","type":"value"},"end":5,
             "dim":"time","latent":false},{"body":"the 5th","start":9,
             "value":{"values":[{
             "value":"2018-12-05T00:00:00.000-08:00","grain":"day",
             "type":"value"},
             {"value":"2019-01-05T00:00:00.000-08:00","grain":"day",
             "type":"value"},
             {"value":"2019-02-05T00:00:00.000-08:00","grain":"day",
             "type":"value"}],
             "value":"2018-12-05T00:00:00.000-08:00","grain":"day",
             "type":"value"},"end":16,"dim":"time",
             "latent":false},{"body":"5th of May","start":13,"value":{
             "values":[{
             "value":"2019-05-05T00:00:00.000-07:00","grain":"day",
             "type":"value"},
             {"value":"2020-05-05T00:00:00.000-07:00","grain":"day",
             "type":"value"},
             {"value":"2021-05-05T00:00:00.000-07:00","grain":"day",
             "type":"value"}],
             "value":"2019-05-05T00:00:00.000-07:00","grain":"day",
             "type":"value"},"end":23,"dim":"time",
             "latent":false},{"body":"tomorrow","start":37,"value":{
             "values":[{
             "value":"2018-11-14T00:00:00.000-08:00","grain":"day",
             "type":"value"}],
             "value":"2018-11-14T00:00:00.000-08:00","grain":"day",
             "type":"value"},"end":45,"dim":"time",
             "latent":false}]"""
    )
    httpretty.enable()

    _config = RasaNLUModelConfig(
        {"pipeline": [{"name": "DucklingHTTPExtractor"}]}
    )
    _config.set_component_attr(0, dimensions=["time"], timezone="UTC",
                               url="http://localhost:8000")
    duckling = component_builder.create_component(_config.for_component(0),
                                                  _config)
    message = Message("Today is the 5th of May. Let us meet tomorrow.")
    duckling.process(message)
    entities = message.get("entities")
    assert len(entities) == 4

    # Test duckling with a defined date

    httpretty.register_uri(
        httpretty.POST,
        "http://localhost:8000/parse",
        body="""[{"body":"tomorrow","start":12,"value":{"values":[{
             "value":"2013-10-13T00:00:00.000Z","grain":"day",
             "type":"value"}],"value":"2013-10-13T00:00:00.000Z",
             "grain":"day","type":"value"},"end":20,
             "dim":"time","latent":false}]"""
    )

    # 1381536182 == 2013/10/12 02:03:02
    message = Message("Let us meet tomorrow.", time="1381536182")
    duckling.process(message)
    entities = message.get("entities")
    assert len(entities) == 1
    assert entities[0]["text"] == "tomorrow"
    assert entities[0]["value"] == "2013-10-13T00:00:00.000Z"

    # Test dimension filtering includes only specified dimensions
    _config = RasaNLUModelConfig(
        {"pipeline": [{"name": "DucklingHTTPExtractor"}]}
    )
    _config.set_component_attr(0, dimensions=["number"],
                               url="http://localhost:8000")
    duckling_number = component_builder.create_component(
        _config.for_component(0),
        _config)
    httpretty.register_uri(
        httpretty.POST,
        "http://localhost:8000/parse",
        body="""[{"body":"Yesterday","start":0,"value":{"values":[{
            "value":"2019-02-28T00:00:00.000+01:00","grain":"day",
            "type":"value"}],"value":"2019-02-28T00:00:00.000+01:00",
            "grain":"day","type":"value"},"end":9,"dim":"time"},
            {"body":"5","start":21,"value":{"value":5,"type":"value"},
            "end":22,"dim":"number"}]"""
    )

    message = Message("Yesterday there were 5 people in a room")
    duckling_number.process(message)
    entities = message.get("entities")
    assert len(entities) == 1
    assert entities[0]["text"] == "5"
    assert entities[0]["value"] == 5
Example #41
    def _convert_example(example: Message) -> List[Tuple[int, int, Text]]:
        def convert_entity(entity):
            return entity["start"], entity["end"], entity["entity"]

        return [convert_entity(ent) for ent in example.get("entities", [])]
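
In use, `_convert_example` turns a message's entity annotations into the (start, end, label) offset tuples that spaCy's GoldParse and the BILOU tagger expect. For example, with an invented entity dict:

entities = [{"start": 16, "end": 20, "entity": "LOC", "value": "West"}]
offsets = [(e["start"], e["end"], e["entity"]) for e in entities]
assert offsets == [(16, 20, "LOC")]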
Example #42
def test_duckling_entity_extractor(component_builder):
    httpretty.register_uri(
        httpretty.POST,
        "http://localhost:8000/parse",
        body="""[{"body":"Today","start":0,"value":{"values":[{
             "value":"2018-11-13T00:00:00.000-08:00","grain":"day",
             "type":"value"}],"value":"2018-11-13T00:00:00.000-08:00",
             "grain":"day","type":"value"},"end":5,
             "dim":"time","latent":false},{"body":"the 5th","start":9,
             "value":{"values":[{
             "value":"2018-12-05T00:00:00.000-08:00","grain":"day",
             "type":"value"},
             {"value":"2019-01-05T00:00:00.000-08:00","grain":"day",
             "type":"value"},
             {"value":"2019-02-05T00:00:00.000-08:00","grain":"day",
             "type":"value"}],
             "value":"2018-12-05T00:00:00.000-08:00","grain":"day",
             "type":"value"},"end":16,"dim":"time",
             "latent":false},{"body":"5th of May","start":13,"value":{
             "values":[{
             "value":"2019-05-05T00:00:00.000-07:00","grain":"day",
             "type":"value"},
             {"value":"2020-05-05T00:00:00.000-07:00","grain":"day",
             "type":"value"},
             {"value":"2021-05-05T00:00:00.000-07:00","grain":"day",
             "type":"value"}],
             "value":"2019-05-05T00:00:00.000-07:00","grain":"day",
             "type":"value"},"end":23,"dim":"time",
             "latent":false},{"body":"tomorrow","start":37,"value":{
             "values":[{
             "value":"2018-11-14T00:00:00.000-08:00","grain":"day",
             "type":"value"}],
             "value":"2018-11-14T00:00:00.000-08:00","grain":"day",
             "type":"value"},"end":45,"dim":"time",
             "latent":false}]"""
    )
    httpretty.enable()

    _config = RasaNLUModelConfig({"pipeline": [{"name": "ner_duckling_http"}]})
    _config.set_component_attr("ner_duckling_http", dimensions=["time"],
                               timezone="UTC", url="http://localhost:8000")
    duckling = component_builder.create_component("ner_duckling_http", _config)
    message = Message("Today is the 5th of May. Let us meet tomorrow.")
    duckling.process(message)
    entities = message.get("entities")
    assert len(entities) == 4

    # Test duckling with a defined date

    httpretty.register_uri(
        httpretty.POST,
        "http://localhost:8000/parse",
        body="""[{"body":"tomorrow","start":12,"value":{"values":[{
             "value":"2013-10-13T00:00:00.000Z","grain":"day",
             "type":"value"}],"value":"2013-10-13T00:00:00.000Z",
             "grain":"day","type":"value"},"end":20,
             "dim":"time","latent":false}]"""
    )

    # 1381536182 == 2013/10/12 02:03:02
    message = Message("Let us meet tomorrow.", time="1381536182")
    duckling.process(message)
    entities = message.get("entities")
    assert len(entities) == 1
    assert entities[0]["text"] == "tomorrow"
    assert entities[0]["value"] == "2013-10-13T00:00:00.000Z"
Example #43
    def process(self, message: Message, **kwargs: Any) -> None:

        updated_entities = message.get("entities", [])[:]
        self.replace_synonyms(updated_entities)
        message.set("entities", updated_entities, add_to_output=True)
Example #44
    def process(self, message: Message, **kwargs: Any) -> None:

        message.set("tokens", self.tokenize(message.get("spacy_doc")))
Example #45
    def _convert_example(example: Message) -> List[Tuple[int, int, Text]]:

        def convert_entity(entity):
            return entity["start"], entity["end"], entity["entity"]

        return [convert_entity(ent) for ent in example.get("entities", [])]