Example #1
    def parse(self,
              text: Text,
              time: Optional[datetime.datetime] = None,
              only_output_properties: bool = True) -> Dict[Text, Any]:
        """Parse the input text, classify it and return pipeline result.

        The pipeline result usually contains intent and entities."""

        if not text:
            # Not all components are able to handle empty strings. So we need
            # to prevent that... This default return will not contain all
            # output attributes of all components, but in the end, no one
            # should pass an empty string in the first place.
            output = self.default_output_attributes()
            output["text"] = ""
            return output

        message = Message(text, self.default_output_attributes(), time=time)

        for component in self.pipeline:
            component.process(message, **self.context)

        output = self.default_output_attributes()
        output.update(
            message.as_dict(only_output_properties=only_output_properties))
        return output
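For orientation, here is a self-contained sketch of the data flow that parse() implements: a message object is passed through each pipeline component in turn, and each component adds attributes to it. The ToyMessage class and the two toy components below are illustrative stand-ins, not the rasa_nlu classes.

class ToyMessage:
    def __init__(self, text):
        self.text = text
        self.data = {}

    def set(self, key, value):
        self.data[key] = value

    def get(self, key, default=None):
        return self.data.get(key, default)


class ToyTokenizer:
    def process(self, message):
        message.set("tokens", message.text.split())


class ToyIntentClassifier:
    def process(self, message):
        name = "greet" if "hello" in message.get("tokens", []) else None
        message.set("intent", {"name": name, "confidence": 1.0 if name else 0.0})


def toy_parse(text, pipeline):
    if not text:                      # same guard as parse() above
        return {"text": ""}
    message = ToyMessage(text)
    for component in pipeline:        # each component enriches the message
        component.process(message)
    output = {"text": message.text}
    output.update(message.data)
    return output


print(toy_parse("hello there", [ToyTokenizer(), ToyIntentClassifier()]))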
Example #2
    def _parse_intent_example(self, example_in_md):
        entities = []
        utter = example_in_md
        match = re.search(ent_regex, utter)
        while match is not None:
            entity_synonym = match.groupdict()['synonym']
            entity_entity = match.groupdict()['entity']
            entity_value = match.groupdict()['value']

            if match.groupdict()['value'] is None:
                entity_value = entity_synonym

            start_index = match.start()
            end_index = start_index + len(entity_synonym)

            entities.append({
                'entity': entity_entity,
                'value': entity_value,
                'start': start_index,
                'end': end_index
            })

            utter = (utter[:match.start()] + entity_synonym +
                     utter[match.end():])
            match = re.search(ent_regex, utter)

        message = Message(utter, {'intent': self.current_intent})
        if len(entities) > 0:
            message.set('entities', entities)
        return message
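The example above relies on an ent_regex defined elsewhere in the project. The standalone sketch below uses an assumed pattern for the markdown annotation syntax ([text](entity) and [text](entity:value)) to show how the annotations are stripped and the character offsets computed; the real regex may differ.

import re

# Assumed pattern, not the project's actual ent_regex.  It matches
#   [west](location)            -> text "west", entity "location"
#   [Tacos](cuisine:Mexican)    -> text "Tacos", entity "cuisine", value "Mexican"
ent_regex = re.compile(r"\[(?P<synonym>[^\]]+)\]"
                       r"\((?P<entity>[^:)]+)(?::(?P<value>[^)]+))?\)")

def strip_annotations(example):
    """Return (plain_text, entities) for one markdown training example."""
    entities = []
    utter = example
    match = ent_regex.search(utter)
    while match is not None:
        text_in_utterance = match.group("synonym")
        value = match.group("value") or text_in_utterance
        start = match.start()
        entities.append({
            "entity": match.group("entity"),
            "value": value,
            "start": start,
            "end": start + len(text_in_utterance),
        })
        # replace the whole annotation with just the surface text
        utter = utter[:match.start()] + text_in_utterance + utter[match.end():]
        match = ent_regex.search(utter)
    return utter, entities

print(strip_annotations("show me a [Mexican](cuisine) place [close by](location)"))
# ('show me a Mexican place close by', [... two entity dicts ...])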
Example #3
    def process(self, message: Message, **kwargs: Any) -> None:

        self._check_spacy_doc(message)

        extracted = self.add_extractor_name(self.extract_entities(message))
        message.set("entities", message.get("entities", []) + extracted,
                    add_to_output=True)
Example #4
def test_unintentional_synonyms_capitalized(component_builder):
    _config = utilities.base_test_conf("all_components")
    ner_syn = component_builder.create_component("ner_synonyms", _config)
    examples = [
        Message("Any Mexican restaurant will do", {
            "intent": "restaurant_search",
            "entities": [{"start": 4, "end": 11,
                          "value": "Mexican", "entity": "cuisine"}]
        }),
        Message("I want Tacos!", {
            "intent": "restaurant_search",
            "entities": [{"start": 7, "end": 12,
                          "value": "Mexican", "entity": "cuisine"}]
        })
    ]
    ner_syn.train(TrainingData(training_examples=examples), _config)
    assert ner_syn.synonyms.get("mexican") is None
    assert ner_syn.synonyms.get("tacos") == "Mexican"
Example #5
def test_crf_extractor(spacy_nlp):
    from rasa_nlu.extractors.crf_entity_extractor import CRFEntityExtractor
    ext = CRFEntityExtractor()
    examples = [
        Message("anywhere in the west", {
            "intent": "restaurant_search",
            "entities": [{"start": 16, "end": 20, "value": "west", "entity": "location"}],
            "spacy_doc": spacy_nlp("anywhere in the west")
        }),
        Message("central indian restaurant", {
            "intent": "restaurant_search",
            "entities": [{"start": 0, "end": 7, "value": "central", "entity": "location"}],
            "spacy_doc": spacy_nlp("central indian restaurant")
        })]
    config = {"ner_crf": {"BILOU_flag": True, "features": ext.crf_features}}
    ext.train(TrainingData(training_examples=examples), config)
    sentence = 'anywhere in the west'
    crf_format = ext._from_text_to_crf(Message(sentence, {"spacy_doc": spacy_nlp(sentence)}))
    assert [word[0] for word in crf_format] == ['anywhere', 'in', 'the', 'west']
    feats = ext._sentence_to_features(crf_format)
    assert 'BOS' in feats[0]
    assert 'EOS' in feats[-1]
    assert feats[1]['0:low'] == "in"
    sentence = 'anywhere in the west'
    ext.extract_entities(Message(sentence, {"spacy_doc": spacy_nlp(sentence)}))
Example #6
    def _from_json_to_crf(
        self, message: Message, entity_offsets: List[Tuple[int, int, Text]]
    ) -> List[Tuple[Text, Text, Text, Text]]:
        """Convert json examples to format of underlying crfsuite."""

        if self.pos_features:
            from spacy.gold import GoldParse

            doc = message.get("spacy_doc")
            gold = GoldParse(doc, entities=entity_offsets)
            ents = [l[5] for l in gold.orig_annot]
        else:
            tokens = message.get("tokens")
            ents = self._bilou_tags_from_offsets(tokens, entity_offsets)

        if '-' in ents:
            logger.warning("Misaligned entity annotation in sentence '{}'. "
                           "Make sure the start and end values of the "
                           "annotated training examples end at token "
                           "boundaries (e.g. don't include trailing "
                           "whitespaces or punctuation)."
                           "".format(message.text))
        if not self.component_config["BILOU_flag"]:
            for i, label in enumerate(ents):
                if self._bilou_from_label(label) in {"B", "I", "U", "L"}:
                    # removes BILOU prefix from label
                    ents[i] = self._entity_from_label(label)

        return self._from_text_to_crf(message, ents)
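When POS features are disabled, the example above calls self._bilou_tags_from_offsets(tokens, entity_offsets). Below is a rough standalone sketch of that conversion, using plain (start, end, text) tuples in place of rasa_nlu Token objects; misaligned annotations get a '-' tag, which is what triggers the warning branch above.

# Standalone sketch of converting character-level entity offsets into BILOU
# token tags.  This mimics the assumed behaviour, not the library's code.
def bilou_tags_from_offsets(tokens, entity_offsets):
    tags = ["O"] * len(tokens)
    for ent_start, ent_end, ent_type in entity_offsets:
        covered = [i for i, (ts, te, _) in enumerate(tokens)
                   if ts >= ent_start and te <= ent_end]
        if not covered:
            continue
        # check that the annotation lines up with token boundaries
        if tokens[covered[0]][0] != ent_start or tokens[covered[-1]][1] != ent_end:
            for i in covered:
                tags[i] = "-"          # misaligned -> warning in the code above
            continue
        if len(covered) == 1:
            tags[covered[0]] = "U-" + ent_type
        else:
            tags[covered[0]] = "B-" + ent_type
            for i in covered[1:-1]:
                tags[i] = "I-" + ent_type
            tags[covered[-1]] = "L-" + ent_type
    return tags


tokens = [(0, 1, "I"), (2, 6, "need"), (7, 8, "a"),
          (9, 13, "home"), (14, 22, "cleaning"), (23, 31, "close-by")]
print(bilou_tags_from_offsets(tokens, [(9, 22, "what"), (23, 31, "where")]))
# ['O', 'O', 'O', 'B-what', 'L-what', 'U-where']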
Example #7
    def process(self, message: Message, **kwargs: Any) -> None:
        """Return the most likely intent and its probability for a message."""

        if not self.clf:
            # component is either not trained or didn't
            # receive enough training data
            intent = None
            intent_ranking = []
        else:
            X = message.get("text_features").reshape(1, -1)
            intent_ids, probabilities = self.predict(X)
            intents = self.transform_labels_num2str(np.ravel(intent_ids))
            # `predict` returns a matrix as it is supposed
            # to work for multiple examples as well, hence we need to flatten
            probabilities = probabilities.flatten()

            if intents.size > 0 and probabilities.size > 0:
                ranking = list(
                    zip(list(intents),
                        list(probabilities)))[:INTENT_RANKING_LENGTH]

                intent = {"name": intents[0], "confidence": probabilities[0]}

                intent_ranking = [{
                    "name": intent_name,
                    "confidence": score
                } for intent_name, score in ranking]
            else:
                intent = {"name": None, "confidence": 0.0}
                intent_ranking = []

        message.set("intent", intent, add_to_output=True)
        message.set("intent_ranking", intent_ranking, add_to_output=True)
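The else branch above pairs the predicted labels with their probabilities and keeps the top INTENT_RANKING_LENGTH entries. A minimal numpy sketch of that step follows; the constant's value is an assumption, and the explicit sort is spelled out here even though the real classifier's predict() already returns results in order.

import numpy as np

INTENT_RANKING_LENGTH = 10          # assumed value of the constant used above

intents = np.array(["greet", "goodbye", "restaurant_search"])
probabilities = np.array([0.07, 0.03, 0.90])

# sort by descending confidence and keep only the top entries
order = probabilities.argsort()[::-1]
ranking = list(zip(intents[order], probabilities[order]))[:INTENT_RANKING_LENGTH]

intent = {"name": str(ranking[0][0]), "confidence": float(ranking[0][1])}
intent_ranking = [{"name": str(name), "confidence": float(score)}
                  for name, score in ranking]

print(intent)            # {'name': 'restaurant_search', 'confidence': 0.9}
print(intent_ranking)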
Example #8
    def _parse_intent_example(self, example_in_md):
        entities = []
        utter = example_in_md
        for regex in [ent_regex, ent_regex_with_value]:
            utter = re.sub(regex, r"\1", utter)  # [text](entity) -> text
            ent_matches = re.finditer(regex, example_in_md)
            for match in ent_matches:
                if 'synonym' in match.groupdict():
                    entity_value_in_utter = match.groupdict()['synonym']
                else:
                    entity_value_in_utter = match.groupdict()['value']

                start_index = utter.index(entity_value_in_utter)
                end_index = start_index + len(entity_value_in_utter)

                entities.append({
                    'entity': match.groupdict()['entity'],
                    'value': match.groupdict()['value'],
                    'start': start_index,
                    'end': end_index
                })

        message = Message(utter, {'intent': self.current_intent})
        if len(entities) > 0:
            message.set('entities', entities)
        return message
Example #9
    def parse(self, text, time=None):
        # type: (Text, Optional[datetime.datetime]) -> Dict[Text, Any]
        """Parse the input text, classify it and return pipeline result.

        The pipeline result usually contains intent and entities."""

        if not text:
            # Not all components are able to handle empty strings. So we need
            # to prevent that... This default return will not contain all
            # output attributes of all components, but in the end, no one should
            # pass an empty string in the first place.
            output = self.default_output_attributes()
            output["text"] = ""
            return output

        message = Message(text, self.default_output_attributes(), time=time)
        print('[DEBUG] Message Output Attributes: %s' % self.default_output_attributes())
        print('[DEBUG] Message Text: %s' % message.text)
        print('[DEBUG] Message Data: %s' % message.data)

        for component in self.pipeline:
            component.process(message, **self.context)

        output = self.default_output_attributes()
        output.update(message.as_dict(only_output_properties=True))
        return output
Example #10
def test_duckling_entity_extractor(component_builder):
    _config = utilities.base_test_conf("all_components")
    _config["duckling_dimensions"] = ["time"]
    duckling = component_builder.create_component("ner_duckling", _config)
    message = Message("Today is the 5th of May. Let us meet tomorrow.")
    duckling.process(message)
    entities = message.get("entities")
    assert len(entities) == 3
Example #11
    def process(self, message: Message, **kwargs: Any) -> None:

        mitie_feature_extractor = self._mitie_feature_extractor(**kwargs)
        features = self.features_for_tokens(message.get("tokens"),
                                            mitie_feature_extractor)
        message.set(
            "text_features",
            self._combine_with_existing_text_features(message, features))
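_combine_with_existing_text_features is not shown here; presumably it concatenates the new feature vector onto whatever is already stored under "text_features", so stacked featurizers contribute to a single vector. A hedged numpy sketch of that assumed behaviour:

import numpy as np

# Assumed behaviour, not the rasa_nlu implementation: append the new
# features to any existing "text_features" vector on the message.
def combine_with_existing_text_features(existing, additional):
    if existing is None:
        return additional
    return np.hstack((existing, additional))

existing = np.array([0.1, 0.2])
additional = np.array([1.0, 0.0, 3.0])
print(combine_with_existing_text_features(existing, additional))
# [0.1 0.2 1.  0.  3. ]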
Example #12
    def _parse_training_example(self, example):
        """Extract entities and synonyms, and convert to plain text."""
        entities = self._find_entities_in_training_example(example)
        plain_text = re.sub(ent_regex,
                            lambda m: m.groupdict()['entity_text'], example)
        self._add_synonyms(plain_text, entities)
        message = Message(plain_text, {'intent': self.current_title})
        if len(entities) > 0:
            message.set('entities', entities)
        return message
Example #13
    def process(self, message: Message, **kwargs: Any) -> None:
        # can't use the existing doc here (spacy_doc on the message)
        # because tokens are lower cased which is bad for NER
        spacy_nlp = kwargs.get("spacy_nlp", None)
        doc = spacy_nlp(message.text)
        extracted = self.add_extractor_name(self.extract_entities(doc))
        message.set("entities",
                    message.get("entities", []) + extracted,
                    add_to_output=True)
Example #14
    def process(self, message: Message, **kwargs: Any) -> None:
        if self.vect is None:
            logger.error("There is no trained CountVectorizer: "
                         "component is either not trained or "
                         "didn't receive enough training data")
        else:
            message_text = self._get_message_text(message)

            bag = self.vect.transform([message_text]).toarray().squeeze()
            message.set(
                "text_features",
                self._combine_with_existing_text_features(message, bag))
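The transform call above is plain scikit-learn. A standalone illustration of the bag-of-words vector that ends up in "text_features" (requires scikit-learn; the training sentences are toy data):

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

# Fit on a few "training" sentences, then transform one incoming message,
# mirroring what the featurizer's train()/process() pair does.
vect = CountVectorizer(token_pattern=r"(?u)\b\w+\b")
vect.fit(["hello bot", "show me mexican restaurants", "hello again"])

bag = vect.transform(["hello mexican restaurants"]).toarray().squeeze()
print(sorted(vect.vocabulary_, key=vect.vocabulary_.get))  # vocabulary in column order
print(bag)                                                 # counts for the message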
Example #15
    def process(self, message: Message, **kwargs: Any) -> None:

        mitie_feature_extractor = kwargs.get("mitie_feature_extractor")
        if not mitie_feature_extractor:
            raise Exception("Failed to train 'MitieFeaturizer'. "
                            "Missing a proper MITIE feature extractor.")

        ents = self.extract_entities(message.text, message.get("tokens"),
                                     mitie_feature_extractor)
        extracted = self.add_extractor_name(ents)
        message.set("entities", message.get("entities", []) + extracted,
                    add_to_output=True)
Example #16
    def process(self, message: Message, **kwargs: Any) -> None:
        # can't use the existing doc here (spacy_doc on the message)
        # because tokens are lower cased which is bad for NER
        spacy_nlp = kwargs.get("spacy_nlp", None)
        doc = spacy_nlp(message.text)
        all_extracted = self.add_extractor_name(self.extract_entities(doc))
        dimensions = self.component_config["dimensions"]
        extracted = SpacyEntityExtractor.filter_irrelevant_entities(
            all_extracted, dimensions)
        message.set("entities",
                    message.get("entities", []) + extracted,
                    add_to_output=True)
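filter_irrelevant_entities is not shown here; presumably it keeps only the entity types listed in the "dimensions" config. A trivial standalone sketch of that assumed behaviour:

# Assumed behaviour: keep only entity types ("dimensions") the component was
# configured to return; with no filter configured, keep everything.
def filter_irrelevant_entities(extracted, requested_dimensions):
    if not requested_dimensions:
        return extracted
    return [e for e in extracted if e["entity"] in requested_dimensions]

extracted = [{"entity": "GPE", "value": "Berlin", "start": 10, "end": 16},
             {"entity": "PERSON", "value": "Anna", "start": 0, "end": 4}]
print(filter_irrelevant_entities(extracted, ["GPE"]))
# [{'entity': 'GPE', 'value': 'Berlin', 'start': 10, 'end': 16}]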
Example #17
def test_spacy_ner_extractor(spacy_nlp):
    ext = SpacyEntityExtractor()
    example = Message("anywhere in the West", {
            "intent": "restaurant_search",
            "entities": [],
            "spacy_doc": spacy_nlp("anywhere in the west")})

    ext.process(example, spacy_nlp=spacy_nlp)

    assert len(example.get("entities", [])) == 1
    assert example.get("entities")[0] == {
        u'start': 16, u'extractor': u'ner_spacy',
        u'end': 20, u'value': u'West', u'entity': u'LOC'}
Example #18
def test_count_vector_featurizer(sentence, expected):
    from rasa_nlu.featurizers.count_vectors_featurizer import \
        CountVectorsFeaturizer

    ftr = CountVectorsFeaturizer({"token_pattern": r'(?u)\b\w+\b'})
    message = Message(sentence)
    message.set("intent", "bla")
    data = TrainingData([message])

    ftr.train(data)
    ftr.process(message)

    assert np.all(message.get("text_features")[0] == expected)
Example #19
    def neg_featurize(self, neg_train_data):
        """Use the previously trained featurizers in the pipeline to
        featurize the negative training data (a set of sentences)."""

        X_neg = []
        for example in neg_train_data:
            m = Message(example)
            self.partially_process(m)
            X_neg.append(m.get("text_features"))

        return np.array(X_neg)
Example #20
    def process(self, message: Message, **kwargs: Any) -> None:
        """Return the most likely intent and its similarity to the input"""

        # Classifier needs this to be non empty, so we set to first label.
        message.data["intent"] = self.label_list[0]

        predict_examples = get_test_examples([message])
        predict_features = convert_examples_to_features(
            predict_examples, self.label_list, self.max_seq_length, self.tokenizer
        )

        # Get first index since we are only classifying text blob at a time.
        example = predict_features[0]

        result = self.predict_fn(
            {
                "input_ids": np.array(example.input_ids).reshape(
                    -1, self.max_seq_length
                ),
                "input_mask": np.array(example.input_mask).reshape(
                    -1, self.max_seq_length
                ),
                "label_ids": np.array(example.label_id).reshape(-1),
                "segment_ids": np.array(example.segment_ids).reshape(
                    -1, self.max_seq_length
                ),
            }
        )

        probabilities = list(np.exp(result["probabilities"])[0])

        with self.session.as_default():
            index = tf.argmax(probabilities, axis=0).eval(session=tf.Session())
            label = self.label_list[index]
            score = float(probabilities[index])

            intent = {"name": label, "confidence": score}
            intent_ranking = sorted(
                [
                    {"name": self.label_list[i], "confidence": float(score)}
                    for i, score in enumerate(probabilities)
                ],
                key=lambda k: k["confidence"],
                reverse=True,
            )

        message.set("intent", intent, add_to_output=True)
        message.set("intent_ranking", intent_ranking, add_to_output=True)
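The TensorFlow session and argmax above can be reproduced with plain numpy, which makes the post-processing easier to see: the model returns log-probabilities, so they are exponentiated, the best index becomes the intent, and the rest are sorted into the ranking. The values below are made up.

import numpy as np

log_probs = np.array([-2.302585, -0.105361, -3.506558])   # assumed model output
label_list = ["greet", "restaurant_search", "goodbye"]

probabilities = np.exp(log_probs)
index = int(np.argmax(probabilities))

intent = {"name": label_list[index], "confidence": float(probabilities[index])}
intent_ranking = sorted(
    [{"name": label_list[i], "confidence": float(p)}
     for i, p in enumerate(probabilities)],
    key=lambda k: k["confidence"],
    reverse=True,
)
print(intent)          # {'name': 'restaurant_search', 'confidence': ~0.9}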
Example #21
def load_rasa_data(filename):
    # type: (Text) -> TrainingData
    """Loads training data stored in the rasa NLU data format."""

    data = _read_json_from_file(filename)
    validate_rasa_nlu_data(data)

    common = data['rasa_nlu_data'].get("common_examples", list())
    intent = data['rasa_nlu_data'].get("intent_examples", list())
    entity = data['rasa_nlu_data'].get("entity_examples", list())
    regex_features = data['rasa_nlu_data'].get("regex_features", list())
    synonyms = data['rasa_nlu_data'].get("entity_synonyms", list())

    entity_synonyms = get_entity_synonyms_dict(synonyms)

    if intent or entity:
        logger.warning("DEPRECATION warning: Data file contains "
                       "'intent_examples' or 'entity_examples' which will be "
                       "removed in the future. Consider putting all your "
                       "examples into the 'common_examples' section.")

    all_examples = common + intent + entity
    training_examples = []
    for e in all_examples:
        data = e.copy()
        if "text" in data:
            del data["text"]
        training_examples.append(Message(e["text"], data))

    return TrainingData(training_examples, entity_synonyms, regex_features)
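Examples #21, #22, #24, #26 and #27 all read the same rasa NLU JSON layout. A minimal example of that layout, and the text/data split the loader performs, is sketched below; only the keys actually read above are shown, and the contents are toy data.

import json

training_file = {
    "rasa_nlu_data": {
        "common_examples": [
            {"text": "show me mexican restaurants",
             "intent": "restaurant_search",
             "entities": [{"start": 8, "end": 15,
                           "value": "mexican", "entity": "cuisine"}]}
        ],
        "entity_synonyms": [{"value": "mexican", "synonyms": ["tex-mex"]}],
        "regex_features": [{"name": "zipcode", "pattern": "[0-9]{5}"}],
    }
}

# Same split as the loader: the text becomes the Message text, everything
# else becomes its data dict.
for e in training_file["rasa_nlu_data"]["common_examples"]:
    data = {k: v for k, v in e.items() if k != "text"}
    print(e["text"], "->", json.dumps(data))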
Example #22
File: rasa.py  Project: hee2000/rasa_nlu
    def read_from_json(self, js, **kwargs):
        """Loads training data stored in the rasa NLU data format."""
        validate_rasa_nlu_data(js)

        data = js['rasa_nlu_data']
        common_examples = data.get("common_examples", [])
        intent_examples = data.get("intent_examples", [])
        entity_examples = data.get("entity_examples", [])
        entity_synonyms = data.get("entity_synonyms", [])
        regex_features = data.get("regex_features", [])
        lookup_tables = data.get("lookup_tables", [])

        entity_synonyms = transform_entity_synonyms(entity_synonyms)

        if intent_examples or entity_examples:
            logger.warning("DEPRECATION warning: your rasa data "
                           "contains 'intent_examples' "
                           "or 'entity_examples' which will be "
                           "removed in the future. Consider "
                           "putting all your examples "
                           "into the 'common_examples' section.")

        all_examples = common_examples + intent_examples + entity_examples
        training_examples = []
        for ex in all_examples:
            msg = Message.build(ex['text'], ex.get("intent"),
                                ex.get("entities"))
            training_examples.append(msg)

        return TrainingData(training_examples, entity_synonyms, regex_features,
                            lookup_tables)
Example #23
def load_wit_data(filename):
    # type: (Text) -> TrainingData
    """Loads training data stored in the WIT.ai data format."""

    training_examples = []

    data = _read_json_from_file(filename)
    for s in data["data"]:
        entities = s.get("entities")
        if entities is None:
            continue
        text = s.get("text")
        intents = [e["value"] for e in entities if e["entity"] == 'intent']
        intent = intents[0].strip("\"") if intents else None

        entities = [e
                    for e in entities
                    if ("start" in e and "end" in e and
                        e["entity"] != 'intent')]
        for e in entities:
            # for some reason wit adds additional quotes around entity values
            e["value"] = e["value"].strip("\"")

        data = {}
        if intent:
            data["intent"] = intent
        if entities is not None:
            data["entities"] = entities
        training_examples.append(Message(text, data))
    return TrainingData(training_examples)
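For comparison, here is a minimal illustration of the WIT.ai export layout this loader (and the reader in example #28) expects: the intent is stored as just another entity, and values carry extra surrounding quotes that are stripped off. The sample data is made up.

wit_export = {
    "data": [
        {"text": "show me mexican restaurants",
         "entities": [
             {"entity": "intent", "value": "\"restaurant_search\""},
             {"entity": "cuisine", "value": "\"mexican\"",
              "start": 8, "end": 15},
         ]}
    ]
}

for s in wit_export["data"]:
    entities = s["entities"]
    intents = [e["value"] for e in entities if e["entity"] == "intent"]
    intent = intents[0].strip('"') if intents else None
    # keep only real entity spans, drop the pseudo "intent" entity
    entities = [e for e in entities
                if "start" in e and "end" in e and e["entity"] != "intent"]
    for e in entities:
        e["value"] = e["value"].strip('"')
    print(intent, entities)
# restaurant_search [{'entity': 'cuisine', 'value': 'mexican', 'start': 8, 'end': 15}]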
Example #24
def load_data(filename):
    # type: (Text) -> TrainingData
    """Loads training data stored in the rasa NLU data format."""

    with io.open(filename, encoding="utf-8-sig") as f:
        data = json.loads(f.read())

    common = data['rasa_nlu_data'].get("common_examples", list())
    intent = data['rasa_nlu_data'].get("intent_examples", list())
    entity = data['rasa_nlu_data'].get("entity_examples", list())
    regex_features = data['rasa_nlu_data'].get("regex_features", list())
    synonyms = data['rasa_nlu_data'].get("entity_synonyms", list())
    entity_synonyms = get_entity_synonyms_dict(synonyms)

    all_examples = common + intent + entity
    training_examples = []
    for e in all_examples:
        data = e.copy()
        if "text" in data:
            del data["text"]
        training_examples.append(Message(e["text"], data))

    return TrainingData(training_examples, entity_synonyms, regex_features)
Example #25
    def filter_trainable_entities(
            self, entity_examples: List[Message]) -> List[Message]:
        """Filters out untrainable entity annotations.

        Creates a copy of entity_examples in which entities that have
        `extractor` set to something other than
        self.name (e.g. 'CRFEntityExtractor') are removed.
        """

        filtered = []
        for message in entity_examples:
            entities = []
            for ent in message.get("entities", []):
                extractor = ent.get("extractor")
                if not extractor or extractor == self.name:
                    entities.append(ent)
            data = message.data.copy()
            data['entities'] = entities
            filtered.append(
                Message(text=message.text,
                        data=data,
                        output_properties=message.output_properties,
                        time=message.time))

        return filtered
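The filtering rule above reduces to: keep an entity if it has no "extractor" set, or if the extractor matches the component itself. A standalone sketch over plain dicts; the component name here is just an example value.

def filter_trainable(entities, own_name="ner_crf"):
    # keep entities that are unattributed or were produced by this component
    return [ent for ent in entities
            if not ent.get("extractor") or ent.get("extractor") == own_name]

entities = [
    {"entity": "cuisine", "value": "mexican", "extractor": "ner_crf"},
    {"entity": "time", "value": "tomorrow", "extractor": "ner_duckling"},
    {"entity": "location", "value": "west"},          # no extractor yet
]
print(filter_trainable(entities))
# keeps the ner_crf entity and the unannotated one, drops the duckling one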
Example #26
def load_train_data(data):
    validate_rasa_nlu_data(data)

    common = data['rasa_nlu_data'].get("common_examples", list())
    intent = data['rasa_nlu_data'].get("intent_examples", list())
    entity = data['rasa_nlu_data'].get("entity_examples", list())
    regex_features = data['rasa_nlu_data'].get("regex_features", list())
    synonyms = data['rasa_nlu_data'].get("entity_synonyms", list())

    entity_synonyms = get_entity_synonyms_dict(synonyms)

    if intent or entity:
        logger.warning("DEPRECATION warning: Data file contains "
                       "'intent_examples' or 'entity_examples' which will be "
                       "removed in the future. Consider putting all your "
                       "examples into the 'common_examples' section.")

    all_examples = common + intent + entity
    training_examples = []
    for e in all_examples:
        data = {}
        if e.get("intent"):
            data["intent"] = e["intent"]
        if e.get("entities") is not None:
            data["entities"] = e["entities"]
        training_examples.append(Message(e["text"], data))

    return TrainingData(training_examples, entity_synonyms, regex_features)
Example #27
def load_rasa_data(filename):
    # type: (Text) -> TrainingData
    """Loads training data stored in the rasa NLU data format."""

    with io.open(filename, encoding="utf-8-sig") as f:
        data = json.loads(f.read())
    validate_rasa_nlu_data(data)

    common = data['rasa_nlu_data'].get("common_examples", list())
    intent = data['rasa_nlu_data'].get("intent_examples", list())
    entity = data['rasa_nlu_data'].get("entity_examples", list())
    regex_features = data['rasa_nlu_data'].get("regex_features", list())
    synonyms = data['rasa_nlu_data'].get("entity_synonyms", list())

    entity_synonyms = get_entity_synonyms_dict(synonyms)

    if intent or entity:
        logger.warning("DEPRECATION warning: Data file contains "
                       "'intent_examples' or 'entity_examples' which will be "
                       "removed in the future. Consider putting all your "
                       "examples into the 'common_examples' section.")

    all_examples = common + intent + entity
    training_examples = []
    for e in all_examples:
        data = {}
        if e.get("intent"):
            data["intent"] = e["intent"]
        if e.get("entities") is not None:
            data["entities"] = e["entities"]
        training_examples.append(Message(e["text"], data))

    return TrainingData(training_examples, entity_synonyms, regex_features)
Example #28
    def read_from_json(self, js: Dict[Text, Any], **kwargs: Any):
        """Loads training data stored in the WIT.ai data format."""
        from rasa_nlu.training_data import Message, TrainingData

        training_examples = []

        for s in js["data"]:
            entities = s.get("entities")
            if entities is None:
                continue
            text = s.get("text")
            intents = [e["value"] for e in entities if e["entity"] == 'intent']
            intent = intents[0].strip("\"") if intents else None

            entities = [
                e for e in entities
                if ("start" in e and "end" in e and e["entity"] != 'intent')
            ]
            for e in entities:
                # for some reason wit adds additional quotes around entities
                e["value"] = e["value"].strip("\"")

            data = {}
            if intent:
                data["intent"] = intent
            if entities is not None:
                data["entities"] = entities
            training_examples.append(Message(text, data))
        return TrainingData(training_examples)
Example #29
def test_crf_json_from_BILOU(spacy_nlp):
    from rasa_nlu.extractors.crf_entity_extractor import CRFEntityExtractor
    ext = CRFEntityExtractor()
    ext.BILOU_flag = True
    sentence = u"I need a home cleaning close-by"
    doc = {"spacy_doc": spacy_nlp(sentence)}
    r = ext._from_crf_to_json(Message(sentence, doc),
                              [{'O': 1.0},
                               {'O': 1.0},
                               {'O': 1.0},
                               {'B-what': 1.0},
                               {'L-what': 1.0},
                               {'B-where': 1.0},
                               {'I-where': 1.0},
                               {'L-where': 1.0}])
    assert len(r) == 2, "There should be two entities"

    assert r[0]["confidence"]  # confidence should exist
    del r[0]["confidence"]
    assert r[0] == {'start': 9, 'end': 22,
                    'value': 'home cleaning', 'entity': 'what'}

    assert r[1]["confidence"]  # confidence should exist
    del r[1]["confidence"]
    assert r[1] == {'start': 23, 'end': 31,
                    'value': 'close-by', 'entity': 'where'}
Example #30
def test_crf_json_from_non_BILOU(spacy_nlp):
    from rasa_nlu.extractors.crf_entity_extractor import CRFEntityExtractor
    ext = CRFEntityExtractor()
    ext.BILOU_flag = False
    sentence = u"I need a home cleaning close-by"
    doc = {"spacy_doc": spacy_nlp(sentence)}
    rs = ext._from_crf_to_json(Message(sentence, doc),
                               [{'O': 1.0},
                                {'O': 1.0},
                                {'O': 1.0},
                                {'what': 1.0},
                                {'what': 1.0},
                                {'where': 1.0},
                                {'where': 1.0},
                                {'where': 1.0}])

    # non BILOU will split multi-word entities - hence 5
    assert len(rs) == 5, "There should be five entities"

    for r in rs:
        assert r['confidence']  # confidence should exist
        del r['confidence']

    assert rs[0] == {'start': 9, 'end': 13,
                     'value': 'home', 'entity': 'what'}
    assert rs[1] == {'start': 14, 'end': 22,
                     'value': 'cleaning', 'entity': 'what'}
    assert rs[2] == {'start': 23, 'end': 28,
                     'value': 'close', 'entity': 'where'}
    assert rs[3] == {'start': 28, 'end': 29,
                     'value': '-', 'entity': 'where'}
    assert rs[4] == {'start': 29, 'end': 31,
                     'value': 'by', 'entity': 'where'}
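The difference between examples #29 and #30 comes down to whether consecutive B-/I-/L- tags are merged into a single span. A standalone sketch of that merge over (start, end, text) tokens, reproducing the two entities expected in example #29:

# Collapse BILOU-tagged tokens into entity spans.  With BILOU tags, B-/I-/L-
# runs merge into one entity; without the prefixes (example #30), every
# tagged token becomes its own entity instead.
def spans_from_bilou(tokens, tags):
    """tokens: list of (start, end, text); tags: one BILOU tag per token."""
    entities, current = [], None
    for (start, end, text), tag in zip(tokens, tags):
        prefix, _, label = tag.partition("-")
        if prefix in ("B", "U"):
            current = {"start": start, "end": end, "value": text, "entity": label}
            if prefix == "U":
                entities.append(current)
                current = None
        elif prefix in ("I", "L") and current is not None:
            current["end"] = end
            current["value"] += " " + text
            if prefix == "L":
                entities.append(current)
                current = None
        else:                       # "O" or stray tag
            current = None
    return entities

tokens = [(0, 1, "I"), (2, 6, "need"), (7, 8, "a"), (9, 13, "home"),
          (14, 22, "cleaning"), (23, 31, "close-by")]
tags = ["O", "O", "O", "B-what", "L-what", "U-where"]
print(spans_from_bilou(tokens, tags))
# [{'start': 9, 'end': 22, 'value': 'home cleaning', 'entity': 'what'},
#  {'start': 23, 'end': 31, 'value': 'close-by', 'entity': 'where'}]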