Example #1
 def process(self, message: Message, **kwargs: Any) -> None:
     entities = self.extract_entities(message)
     entities = self.add_extractor_name(entities)
     message.set(ENTITIES, message.get(ENTITIES, []) + entities, add_to_output=True)
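`add_extractor_name` comes from Rasa's `EntityExtractor` base class and is not shown here; a minimal sketch of what it does (tagging each entity dict with the producing component's name) could look like this:

from typing import Any, Dict, List, Text

def add_extractor_name(self, entities: List[Dict[Text, Any]]) -> List[Dict[Text, Any]]:
    # Record which component produced each entity; Rasa stores this under the "extractor" key.
    for entity in entities:
        entity["extractor"] = self.name
    return entities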
Example #2
 def _tokens_of_message(message: Message) -> List[Text]:
     return [token.text for token in message.get(TOKENS_NAMES[TEXT], [])]
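A minimal usage sketch, assuming a whitespace tokenizer has already populated the tokens via the pre-3.0 `process(message)` API used elsewhere in these examples:

message = Message(data={TEXT: "hello Rasa"})
WhitespaceTokenizer().process(message)
_tokens_of_message(message)  # -> ["hello", "Rasa"]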
Example #3
 def process(self, message: Message, **kwargs: Any) -> None:
     for attribute in DENSE_FEATURIZABLE_ATTRIBUTES:
         if message.get(attribute):
             message.set(SPACY_DOCS[attribute],
                         self.doc_for_text(message.get(attribute)))
Example #4
    def process(self, message: Message, **kwargs: Any) -> None:

        updated_entities = message.get(ENTITIES, [])[:]
        self.replace_synonyms(updated_entities)
        message.set(ENTITIES, updated_entities, add_to_output=True)
Example #5
 def tokenize(self, message: Message, attribute: Text) -> List[Token]:
     text = message.get(attribute)
     words = word_tokenize(text)
     # words = ViTokenizer.tokenize(text).split(' ')
     return self._convert_words_to_tokens(words, text)
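`_convert_words_to_tokens` is inherited from Rasa's `Tokenizer` base class and not shown above; a minimal sketch of the alignment it performs (mapping each word back to its character offset in the original text) could look like this. The import path is an assumption and may differ between Rasa versions:

from typing import List, Text
from rasa.nlu.tokenizers.tokenizer import Token

def _convert_words_to_tokens(words: List[Text], text: Text) -> List[Token]:
    running_offset = 0
    tokens = []
    for word in words:
        # Find this word in the original text, starting after the previous token;
        # assumes every word occurs verbatim in the text.
        word_offset = text.index(word, running_offset)
        running_offset = word_offset + len(word)
        tokens.append(Token(word, word_offset))
    return tokens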
Example #6
 def get_doc(self, message: Message, attribute: Text) -> Any:
     return message.get(SPACY_DOCS[attribute])
Example #7
    def compute_similarity_for_pair(self, a: Message, b: Message):
        features_a = a.get("sentence_features").vector
        features_b = b.get("sentence_features").vector

        return self.compute_similarity_score(features_a, features_b)
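`compute_similarity_score` is not part of the snippet; if it is meant to be a cosine similarity over the two dense sentence vectors (an assumption, not confirmed by the code above), a minimal sketch would be:

import numpy as np

def compute_similarity_score(self, features_a: np.ndarray, features_b: np.ndarray) -> float:
    # Cosine similarity; returns 0.0 if either vector is all zeros.
    denominator = np.linalg.norm(features_a) * np.linalg.norm(features_b)
    if denominator == 0.0:
        return 0.0
    return float(np.dot(features_a, features_b) / denominator)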
Example #8
 def _get_doc(self, message: Message, attribute: Text) -> Optional["Doc"]:
     return message.get(SPACY_DOCS[attribute])
Example #9
    def _unpack(self, message: Message, domain: Domain) -> Message:
        """Unpacks the messsage if `TEXT` contains an encoding of attributes.

        Args:
            message: some message
            domain: the domain
        Returns:
            the given message if that message does not need to be unpacked, and a new
            message with the extracted attributes otherwise
        """
        user_text = message.get(TEXT).strip()

        # If the prefix doesn't match, we don't even need to try to match the pattern.
        if not user_text.startswith(self._prefix):
            return message

        # Try to match the pattern.
        match = self._pattern.match(user_text)

        # If it doesn't match, then (potentially) something went wrong, because the
        # message text did start with the special prefix -- however, a user might
        # just have decided to start their text this way.
        if not match:
            logger.warning(
                f"Failed to parse intent end entities from '{user_text}'.")
            return message

        # Extract attributes from the match - and validate it via the domain.
        intent_name = self._parse_intent_name(match, domain)
        confidence = self._parse_optional_confidences(match)
        entities = self._parse_optional_entities(match, domain)

        # The intent name is *not* optional, but during parsing we might find out
        # that the given intent is unknown (and warn). In this case, stop here.
        if intent_name is None:
            return message

        if match.group("rest"):
            rasa.shared.utils.io.raise_warning(
                f"Failed to parse arguments in line '{match.string}'. "
                f"Failed to interpret some parts. "
                f"Continuing without {match.group('rest')}. ",
                docs=DOCS_URL_STORIES,
            )

        # Add the results to the message.
        intent_data = {
            INTENT_NAME_KEY: intent_name,
            PREDICTED_CONFIDENCE_KEY: confidence,
        }
        intent_ranking = [{
            INTENT_NAME_KEY: intent_name,
            PREDICTED_CONFIDENCE_KEY: confidence,
        }]
        message_data = {}
        message_data[TEXT] = user_text
        message_data[INTENT] = intent_data
        message_data[INTENT_RANKING_KEY] = intent_ranking
        message_data[ENTITIES] = entities
        return Message(message_data,
                       output_properties=set(message_data.keys()))
Example #10
 def process(self, message: Message, **kwargs: Any) -> None:
     extracted = self._match_entities(message)
     message.set("entities",
                 message.get("entities", []) + extracted,
                 add_to_output=True)
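`_match_entities` is not shown; judging from the entity dicts built elsewhere in this list (see Example #16), it presumably returns entries shaped like the following (values purely illustrative):

extracted = [
    {"entity": "city", "value": "Berlin", "start": 10, "end": 16, "confidence": None},
]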
Example #11
def test_base_examples(example):
    message = Message({TEXT: example["text"]})
    tok = BlankSpacyTokenizer(component_config={"lang": example["lang"]})
    tok.process(message)
    tokens = message.get(TOKENS_NAMES[TEXT])
    assert [t.text for t in tokens] == example["result"]
Example #12
 def test_classification(self, trained_classifier, message, intent):
     text = Message(data={TEXT: message})
     trained_classifier.process(text)
     assert text.get("intent").get("name", "NOT_CLASSIFIED") == intent
Example #13
 def process(self, message: Message, **kwargs: Any) -> None:
     if message.get(TEXT) is not None:
         message.set(TEXT, message.get(TEXT).title())
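A quick illustration of the in-place mutation above (the component class name `TitleCaser` is made up for this sketch):

message = Message(data={TEXT: "rasa is great"})
TitleCaser().process(message)
message.get(TEXT)  # -> "Rasa Is Great"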
Example #14
async def test_adjusting_layers_incremental_training(
    create_response_selector: Callable[[Dict[Text, Any]], ResponseSelector],
    load_response_selector: Callable[[Dict[Text, Any]], ResponseSelector],
    train_and_preprocess: Callable[..., Tuple[TrainingData,
                                              List[GraphComponent]]],
    process_message: Callable[..., Message],
):
    """Tests adjusting sparse layers of `ResponseSelector` to increased sparse
    feature sizes during incremental training.

    Testing is done by checking the layer sizes.
    Checking if they were replaced correctly is also important
    and is done in `test_replace_dense_for_sparse_layers`
    in `test_rasa_layers.py`.
    """
    iter1_data_path = "data/test_incremental_training/iter1/"
    iter2_data_path = "data/test_incremental_training/"
    pipeline = [
        {"component": WhitespaceTokenizer},
        {"component": LexicalSyntacticFeaturizer},
        {"component": RegexFeaturizer},
        {"component": CountVectorsFeaturizer},
        {
            "component": CountVectorsFeaturizer,
            "analyzer": "char_wb",
            "min_ngram": 1,
            "max_ngram": 4,
        },
    ]
    training_data, loaded_pipeline = train_and_preprocess(
        pipeline, iter1_data_path)
    response_selector = create_response_selector({EPOCHS: 1})
    response_selector.train(training_data=training_data)

    old_data_signature = response_selector.model.data_signature
    old_predict_data_signature = response_selector.model.predict_data_signature

    message = Message(data={TEXT: "Rasa is great!"})
    message = process_message(loaded_pipeline, message)

    message2 = copy.deepcopy(message)

    classified_message = response_selector.process([message])[0]

    old_sparse_feature_sizes = classified_message.get_sparse_feature_sizes(
        attribute=TEXT)

    initial_rs_layers = response_selector.model._tf_layers[
        "sequence_layer.text"]._tf_layers["feature_combining"]
    initial_rs_sequence_layer = initial_rs_layers._tf_layers[
        "sparse_dense.sequence"]._tf_layers["sparse_to_dense"]
    initial_rs_sentence_layer = initial_rs_layers._tf_layers[
        "sparse_dense.sentence"]._tf_layers["sparse_to_dense"]

    initial_rs_sequence_size = initial_rs_sequence_layer.get_kernel().shape[0]
    initial_rs_sentence_size = initial_rs_sentence_layer.get_kernel().shape[0]
    assert initial_rs_sequence_size == sum(
        old_sparse_feature_sizes[FEATURE_TYPE_SEQUENCE])
    assert initial_rs_sentence_size == sum(
        old_sparse_feature_sizes[FEATURE_TYPE_SENTENCE])

    loaded_selector = load_response_selector({EPOCHS: 1})

    classified_message2 = loaded_selector.process([message2])[0]

    assert classified_message2.fingerprint() == classified_message.fingerprint()

    training_data2, loaded_pipeline2 = train_and_preprocess(
        pipeline, iter2_data_path)

    response_selector.train(training_data=training_data2)

    new_message = Message.build(text="Rasa is great!")
    new_message = process_message(loaded_pipeline2, new_message)

    classified_new_message = response_selector.process([new_message])[0]
    new_sparse_feature_sizes = classified_new_message.get_sparse_feature_sizes(
        attribute=TEXT)

    final_rs_layers = response_selector.model._tf_layers[
        "sequence_layer.text"]._tf_layers["feature_combining"]
    final_rs_sequence_layer = final_rs_layers._tf_layers[
        "sparse_dense.sequence"]._tf_layers["sparse_to_dense"]
    final_rs_sentence_layer = final_rs_layers._tf_layers[
        "sparse_dense.sentence"]._tf_layers["sparse_to_dense"]

    final_rs_sequence_size = final_rs_sequence_layer.get_kernel().shape[0]
    final_rs_sentence_size = final_rs_sentence_layer.get_kernel().shape[0]
    assert final_rs_sequence_size == sum(
        new_sparse_feature_sizes[FEATURE_TYPE_SEQUENCE])
    assert final_rs_sentence_size == sum(
        new_sparse_feature_sizes[FEATURE_TYPE_SENTENCE])
    # check if the data signatures were correctly updated
    new_data_signature = response_selector.model.data_signature
    new_predict_data_signature = response_selector.model.predict_data_signature
    iter2_data = load_data(iter2_data_path)
    expected_sequence_lengths = len([
        message for message in iter2_data.training_examples
        if message.get(INTENT_RESPONSE_KEY)
    ])

    def test_data_signatures(
        new_signature: Dict[Text, Dict[Text, List[FeatureArray]]],
        old_signature: Dict[Text, Dict[Text, List[FeatureArray]]],
    ):
        # Wherever attribute / feature_type signature is not
        # expected to change, directly compare it to old data signature.
        # Else compute its expected signature and compare
        attributes_expected_to_change = [TEXT]
        feature_types_expected_to_change = [
            FEATURE_TYPE_SEQUENCE,
            FEATURE_TYPE_SENTENCE,
        ]

        for attribute, signatures in new_signature.items():

            for feature_type, feature_signatures in signatures.items():

                if feature_type == "sequence_lengths":
                    assert feature_signatures[
                        0].units == expected_sequence_lengths

                elif feature_type not in feature_types_expected_to_change:
                    assert feature_signatures == old_signature.get(
                        attribute).get(feature_type)
                else:
                    for index, feature_signature in enumerate(
                            feature_signatures):
                        if (feature_signature.is_sparse and attribute
                                in attributes_expected_to_change):
                            assert feature_signature.units == sum(
                                new_sparse_feature_sizes.get(feature_type))
                        else:
                            # dense signature or attributes that are not
                            # expected to change can be compared directly
                            assert (
                                feature_signature.units == old_signature.get(
                                    attribute).get(feature_type)[index].units)

    test_data_signatures(new_data_signature, old_data_signature)
    test_data_signatures(new_predict_data_signature,
                         old_predict_data_signature)
Example #15
 def compute_features(self, example: Message):
     features = self.model[example.get("text")]
     example.set("sentence_features", features)
Example #16
    def process(self, message: Message, **kwargs: Any) -> None:
        from tokenizer_tools.tagset.NER.BILUO import BILUOSequenceEncoderDecoder
        from tokenizer_tools.tagset.offset.sequence import Sequence

        decoder = BILUOSequenceEncoderDecoder()

        real_result_dir = os.path.join(self.model_dir, self.result_dir)
        print(real_result_dir)

        input_text = message.text

        input_feature = {
            'words': [[i for i in input_text]],
            'words_len': [len(input_text)],
        }

        print(input_feature)

        predictions = self.predict_fn(input_feature)
        tags = predictions['tags'][0]
        # print(predictions['tags'])

        # decode Unicode
        tags_seq = [i.decode() for i in tags]

        print(tags_seq)

        # BILUO to offset
        failed = False
        try:
            seq = decoder.to_offset(tags_seq, input_text)
        except Exception as e:
            # an invalid tag sequence will raise an exception,
            # so return an empty result
            logger.error("Decode error: {}".format(e))
            seq = Sequence(input_text)
            failed = True
        # print(seq)

        print(seq, tags_seq, failed)

        entity_set = []

        seq.span_set.fill_text(input_text)

        for span in seq.span_set:
            ent = {
                "entity": span.entity,
                "value": span.value,
                "start": span.start,
                "confidence": None,
                "end": span.end
            }

            entity_set.append(ent)

        extracted = self.add_extractor_name(entity_set)

        message.set("entities",
                    message.get("entities", []) + extracted,
                    add_to_output=True)
Example #17
def test_duckling_entity_extractor(component_builder):
    with responses.RequestsMock() as rsps:
        rsps.add(
            responses.POST,
            "http://localhost:8000/parse",
            json=[
                {
                    "body": "Today",
                    "start": 0,
                    "value": {
                        "values": [{
                            "value": "2018-11-13T00:00:00.000-08:00",
                            "grain": "day",
                            "type": "value",
                        }],
                        "value":
                        "2018-11-13T00:00:00.000-08:00",
                        "grain":
                        "day",
                        "type":
                        "value",
                    },
                    "end": 5,
                    "dim": "time",
                    "latent": False,
                },
                {
                    "body": "the 5th",
                    "start": 9,
                    "value": {
                        "values": [
                            {
                                "value": "2018-12-05T00:00:00.000-08:00",
                                "grain": "day",
                                "type": "value",
                            },
                            {
                                "value": "2019-01-05T00:00:00.000-08:00",
                                "grain": "day",
                                "type": "value",
                            },
                            {
                                "value": "2019-02-05T00:00:00.000-08:00",
                                "grain": "day",
                                "type": "value",
                            },
                        ],
                        "value":
                        "2018-12-05T00:00:00.000-08:00",
                        "grain":
                        "day",
                        "type":
                        "value",
                    },
                    "end": 16,
                    "dim": "time",
                    "latent": False,
                },
                {
                    "body": "5th of May",
                    "start": 13,
                    "value": {
                        "values": [
                            {
                                "value": "2019-05-05T00:00:00.000-07:00",
                                "grain": "day",
                                "type": "value",
                            },
                            {
                                "value": "2020-05-05T00:00:00.000-07:00",
                                "grain": "day",
                                "type": "value",
                            },
                            {
                                "value": "2021-05-05T00:00:00.000-07:00",
                                "grain": "day",
                                "type": "value",
                            },
                        ],
                        "value":
                        "2019-05-05T00:00:00.000-07:00",
                        "grain":
                        "day",
                        "type":
                        "value",
                    },
                    "end": 23,
                    "dim": "time",
                    "latent": False,
                },
                {
                    "body": "tomorrow",
                    "start": 37,
                    "value": {
                        "values": [{
                            "value": "2018-11-14T00:00:00.000-08:00",
                            "grain": "day",
                            "type": "value",
                        }],
                        "value":
                        "2018-11-14T00:00:00.000-08:00",
                        "grain":
                        "day",
                        "type":
                        "value",
                    },
                    "end": 45,
                    "dim": "time",
                    "latent": False,
                },
            ],
        )

        _config = RasaNLUModelConfig(
            {"pipeline": [{"name": "DucklingEntityExtractor"}]})
        _config.set_component_attr(0,
                                   dimensions=["time"],
                                   timezone="UTC",
                                   url="http://localhost:8000")
        duckling = component_builder.create_component(_config.for_component(0),
                                                      _config)
        message = Message(
            data={TEXT: "Today is the 5th of May. Let us meet tomorrow."})
        duckling.process(message)
        entities = message.get("entities")
        assert len(entities) == 4

    # Test duckling with a defined date

    with responses.RequestsMock() as rsps:
        rsps.add(
            responses.POST,
            "http://localhost:8000/parse",
            json=[{
                "body": "tomorrow",
                "start": 12,
                "value": {
                    "values": [{
                        "value": "2013-10-13T00:00:00.000Z",
                        "grain": "day",
                        "type": "value",
                    }],
                    "value":
                    "2013-10-13T00:00:00.000Z",
                    "grain":
                    "day",
                    "type":
                    "value",
                },
                "end": 20,
                "dim": "time",
                "latent": False,
            }],
        )

        # 1381536182 == 2013/10/12 02:03:02
        message = Message(data={TEXT: "Let us meet tomorrow."},
                          time="1381536182")
        duckling.process(message)
        entities = message.get("entities")
        assert len(entities) == 1
        assert entities[0]["text"] == "tomorrow"
        assert entities[0]["value"] == "2013-10-13T00:00:00.000Z"

        # Test dimension filtering includes only specified dimensions
        _config = RasaNLUModelConfig(
            {"pipeline": [{"name": "DucklingEntityExtractor"}]})
        _config.set_component_attr(0,
                                   dimensions=["number"],
                                   url="http://localhost:8000")
        duckling_number = component_builder.create_component(
            _config.for_component(0), _config)

    with responses.RequestsMock() as rsps:
        rsps.add(
            responses.POST,
            "http://localhost:8000/parse",
            json=[
                {
                    "body": "Yesterday",
                    "start": 0,
                    "value": {
                        "values": [{
                            "value": "2019-02-28T00:00:00.000+01:00",
                            "grain": "day",
                            "type": "value",
                        }],
                        "value":
                        "2019-02-28T00:00:00.000+01:00",
                        "grain":
                        "day",
                        "type":
                        "value",
                    },
                    "end": 9,
                    "dim": "time",
                },
                {
                    "body": "5",
                    "start": 21,
                    "value": {
                        "value": 5,
                        "type": "value"
                    },
                    "end": 22,
                    "dim": "number",
                },
            ],
        )

        message = Message(
            data={TEXT: "Yesterday there were 5 people in a room"})
        duckling_number.process(message)
        entities = message.get("entities")

        assert len(entities) == 1
        assert entities[0]["text"] == "5"
        assert entities[0]["value"] == 5
Example #18
    def _features_for_patterns(
        self, message: Message, attribute: Text
    ) -> Tuple[Optional[scipy.sparse.coo_matrix],
               Optional[scipy.sparse.coo_matrix]]:
        """Checks which known patterns match the message.

        Given a sentence, returns a vector of {1,0} values indicating which
        regexes did match. Furthermore, if the
        message is tokenized, the function will mark all tokens with a dict
        relating the name of the regex to whether it was matched.

        Args:
            message: Message to be featurized.
            attribute: Attribute of message to be featurized.

        Returns:
           Token and sentence level features of message attribute.
        """
        # Attribute not set (e.g. response not present)
        if not message.get(attribute):
            return None, None

        tokens = message.get(TOKENS_NAMES[attribute], [])

        if not tokens:
            # nothing to featurize
            return None, None

        flags = 0  # default flag
        if not self.case_sensitive:
            flags = re.IGNORECASE

        sequence_length = len(tokens)

        num_patterns = len(self.known_patterns)

        sequence_features = np.zeros([sequence_length, num_patterns])
        sentence_features = np.zeros([1, num_patterns])

        for pattern_index, pattern in enumerate(self.known_patterns):
            matches = re.finditer(pattern["pattern"],
                                  message.get(attribute),
                                  flags=flags)
            matches = list(matches)

            for token_index, t in enumerate(tokens):
                patterns = t.get("pattern", default={})
                patterns[pattern["name"]] = False

                for match in matches:
                    if t.start < match.end() and t.end > match.start():
                        patterns[pattern["name"]] = True
                        sequence_features[token_index][pattern_index] = 1.0
                        if attribute in [RESPONSE, TEXT, ACTION_TEXT]:
                            # sentence vector should contain all patterns
                            sentence_features[0][pattern_index] = 1.0

                t.set("pattern", patterns)

        return (
            scipy.sparse.coo_matrix(sequence_features),
            scipy.sparse.coo_matrix(sentence_features),
        )
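For reference, the loop above treats `self.known_patterns` as a list of dicts with `"name"` and `"pattern"` keys; a hedged sketch of that shape and of the resulting matrix sizes:

# Shape inferred from the loop above; real patterns come from the training data / lookup tables.
known_patterns = [
    {"name": "zipcode", "pattern": r"\b\d{5}\b"},
    {"name": "greet", "pattern": r"hey|hello|hi"},
]
# With N tokens and P patterns, the returned matrices are (N, P) and (1, P) sparse COO matrices.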
Example #19
    def unpack_regex_message(
        message: Message,
        domain: Optional[Domain] = None,
        entity_extractor_name: Optional[Text] = None,
    ) -> Message:
        """Unpacks the message if `TEXT` contains an encoding of attributes.

        Args:
            message: some message
            domain: the domain
            entity_extractor_name: An extractor name which should be added for the
                entities.

        Returns:
            the given message if that message does not need to be unpacked, and a new
            message with the extracted attributes otherwise
        """
        user_text = message.get(TEXT).strip()

        # If the prefix doesn't match, we don't even need to try to match the pattern.
        if not user_text.startswith(INTENT_MESSAGE_PREFIX):
            return message

        # Try to match the pattern.
        match = YAMLStoryReader._regex_message_pattern().match(user_text)

        # If it doesn't match, then (potentially) something went wrong, because the
        # message text did start with the special prefix -- however, a user might
        # just have decided to start their text this way.
        if not match:
            logger.warning(f"Failed to parse intent end entities from '{user_text}'.")
            return message

        # Extract attributes from the match - and validate it via the domain.
        intent_name = YAMLStoryReader._intent_name_from_regex_match(match, domain)
        confidence = YAMLStoryReader._confidences_from_regex_match(match)
        entities = YAMLStoryReader._entities_from_regex_match(
            match, domain, entity_extractor_name
        )

        # The intent name is *not* optional, but during parsing we might find out
        # that the given intent is unknown (and warn). In this case, stop here.
        if intent_name is None:
            return message

        if match.group("rest"):
            rasa.shared.utils.io.raise_warning(
                f"Failed to parse arguments in line '{match.string}'. "
                f"Failed to interpret some parts. "
                f"Make sure your regex string is in the following format:"
                f"{INTENT_MESSAGE_PREFIX}"
                f"<intent_name>@<confidence-value><dictionary of entities> "
                f"Continuing without {match.group('rest')}. "
            )

        # Add the results to the message.
        intent_data = {
            INTENT_NAME_KEY: intent_name,
            PREDICTED_CONFIDENCE_KEY: confidence,
        }
        intent_ranking = [
            {INTENT_NAME_KEY: intent_name, PREDICTED_CONFIDENCE_KEY: confidence}
        ]
        message_data = {}
        message_data[TEXT] = user_text
        message_data[INTENT] = intent_data
        message_data[INTENT_RANKING_KEY] = intent_ranking
        message_data[ENTITIES] = entities
        return Message(message_data, output_properties=set(message_data.keys()))
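A usage sketch of the unpacking above, assuming the default `/` intent prefix and the `<intent_name>@<confidence-value><dictionary of entities>` format mentioned in the warning text (the concrete values are illustrative):

message = Message(data={TEXT: '/greet@0.95{"name": "rasa"}'})
unpacked = YAMLStoryReader.unpack_regex_message(message)
# If the pattern matches and no domain is given to validate against:
# unpacked.get(INTENT)   -> {"name": "greet", "confidence": 0.95}
# unpacked.get(ENTITIES) -> entities parsed from the trailing dictionary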