Example #1
    def _combine_with_existing_sparse_features(
        message: Message,
        additional_features: Any,
        feature_name: Text = SPARSE_FEATURE_NAMES[TEXT],
    ) -> Any:
        if additional_features is None:
            return

        if message.get(feature_name) is not None:
            from scipy.sparse import hstack

            if message.get(
                    feature_name).shape[0] != additional_features.shape[0]:
                raise ValueError(
                    f"Cannot concatenate sparse features as sequence dimension does not "
                    f"match: {message.get(feature_name).shape[0]} != "
                    f"{additional_features.shape[0]}. Message: '{message.text}'."
                )
            return hstack([message.get(feature_name), additional_features])
        else:
            return additional_features
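
A minimal, self-contained sketch of the same hstack-based combination outside of Rasa, assuming two scipy sparse matrices whose sequence (row) dimensions match; the names `existing` and `additional` are illustrative:

import numpy as np
import scipy.sparse

existing = scipy.sparse.coo_matrix(np.array([[1, 0], [0, 1]]))  # shape (2, 2)
additional = scipy.sparse.coo_matrix(np.array([[2], [3]]))      # shape (2, 1)

# mirror the sequence-dimension check above before concatenating
if existing.shape[0] != additional.shape[0]:
    raise ValueError("sequence dimensions do not match")

combined = scipy.sparse.hstack([existing, additional])
print(combined.shape)  # (2, 3): feature columns are concatenated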
Example #2
    def set_fasttext_features(self,
                              message: Message,
                              attribute: Text = TEXT) -> None:
        tokens = message.get(TOKENS_NAMES[attribute])

        if not tokens:
            return None

        text_vector = self.model.get_word_vector(message.text)
        word_vectors = [
            self.model.get_word_vector(t.text)
            for t in train_utils.tokens_without_cls(message, attribute)
        ]
        X = np.array(word_vectors +
                     [text_vector])  # remember, we need one for __CLS__

        features = self._combine_with_existing_dense_features(
            message,
            additional_features=X,
            feature_name=DENSE_FEATURE_NAMES[attribute])
        message.set(DENSE_FEATURE_NAMES[attribute], features)
Example #3
    def _combine_with_existing_dense_features(
        message: Message,
        additional_features: Any,
        feature_name: Text = DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE],
    ) -> Any:
        if message.get(feature_name) is not None:

            if len(message.get(feature_name)) != len(additional_features):
                raise ValueError(
                    f"Cannot concatenate dense features as sequence dimension does not "
                    f"match: {len(message.get(feature_name))} != "
                    f"{len(additional_features)}. "
                    f"Make sure to set 'return_sequence' to the same value for all your "
                    f"featurizers."
                )

            return np.concatenate(
                (message.get(feature_name), additional_features), axis=-1
            )
        else:
            return additional_features
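
For contrast with the sparse case in Example #1, a minimal sketch of the dense combination: `np.concatenate` along the last axis, after the same length check. The array shapes are illustrative:

import numpy as np

existing = np.zeros((4, 3))    # (sequence length, feature dim)
additional = np.ones((4, 2))

if len(existing) != len(additional):
    raise ValueError("sequence dimensions do not match")

combined = np.concatenate((existing, additional), axis=-1)
print(combined.shape)  # (4, 5)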
Example #4
def test_jieba_load_dictionary(tmpdir_factory):
    dictionary_path = tmpdir_factory.mktemp("jieba_custom_dictionary").strpath

    component_config = {"dictionary_path": dictionary_path}

    with patch.object(JiebaTokenizer,
                      "load_custom_dictionary",
                      return_value=None) as mock_method:
        tk = JiebaTokenizer(component_config)
        tk.tokenize(Message(""), attribute=TEXT_ATTRIBUTE)

    mock_method.assert_called_once_with(dictionary_path)
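
The test above relies on `unittest.mock.patch.object` to stub out dictionary loading during construction. A generic sketch of that pattern, with a hypothetical `Loader` class standing in for `JiebaTokenizer`:

from unittest.mock import patch

class Loader:
    def __init__(self, path):
        self.load(path)  # side effect we want to skip in tests

    def load(self, path):
        raise IOError("would hit the filesystem")

with patch.object(Loader, "load", return_value=None) as mock_load:
    Loader("/tmp/dictionary")  # load() is now a mock; no filesystem access

mock_load.assert_called_once_with("/tmp/dictionary")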
Example #5
def test_count_vector_featurizer():
    sentence, expected, expected_cls = samples[1]
    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})

    train_message = Message(sentence)
    test_message = Message(sentence)

    WhitespaceTokenizer().process(train_message)
    WhitespaceTokenizer().process(test_message)
    show_message(train_message)
    ftr.train(TrainingData([train_message]))
    show_message(train_message)
    ftr.process(test_message)

    assert isinstance(test_message.get(SPARSE_FEATURE_NAMES[TEXT]),
                      scipy.sparse.coo_matrix)

    actual = test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()

    assert np.all(actual[0] == expected)
    assert np.all(actual[-1] == expected_cls)
Example #6
    def process(self, message: Message, **kwargs: Any):

        spans = message.get("spans", [])
        pronouns = [span for span in spans if span['label'] == 'Pronoun']
        coreferences = []
        for pronoun in pronouns:
            ent = self.stag(pronoun, message)
            if ent:
                coreferences.append({
                    "pronoun": {
                        'start': pronoun['start'],
                        'end': pronoun['end']
                    },
                    "entity": {
                        'start': ent['start'],
                        'end': ent['end']
                    }
                })
        span_output_format(spans)
        message.set("coreferences", coreferences, add_to_output=True)
        logging.info("coref data: {}".format(message.data))
Example #7
def test_text_featurizer_window_size(sentence, expected, expected_cls):
    featurizer = LexicalSyntacticFeaturizer(
        {"features": [["upper"], ["digit"], ["low"], ["digit"]]})

    train_message = Message(sentence)
    test_message = Message(sentence)

    WhitespaceTokenizer().process(train_message)
    WhitespaceTokenizer().process(test_message)

    featurizer.train(TrainingData([train_message]))

    featurizer.process(test_message)

    assert isinstance(test_message.get(SPARSE_FEATURE_NAMES[TEXT]),
                      scipy.sparse.coo_matrix)

    actual = test_message.get(SPARSE_FEATURE_NAMES[TEXT]).toarray()

    assert np.all(actual[0] == expected)
    assert np.all(actual[-1] == expected_cls)
Example #8
    def process(self, message: Message, **kwargs):
        """Process an incoming message.

        Determine the gender of person entities:
        1. extract the person entities
        2. classify each entity's gender
        """
        entities = message.get("entities", [])
        for ent in entities:
            if ent['dim'] == 'Nh':
                # person entity: predict gender and attach the label
                _g = self.n2g.predict(ent['value'])[0]
                ent.update({'gender': self.int2label[_g]})
Example #9
    def process(self, message: Message, **kwargs: Any) -> None:
        """Process incoming message and compute and set features"""

        if self.vectorizers is None:
            logger.error("There is no trained CountVectorizer: "
                         "component is either not trained or "
                         "didn't receive enough training data")
            return

        attribute = TEXT
        message_tokens = self._get_processed_message_tokens_by_attribute(
            message, attribute)

        # features shape (1, seq, dim)
        features = self._create_sequence(attribute, [message_tokens])

        if features[0] is not None:
            final_features = Features(
                features[0], attribute,
                self.component_config[FEATURIZER_CLASS_ALIAS])
            message.add_features(final_features)
Example #10
def test_count_vector_featurizer(sentence, expected, expected_cls):
    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})

    train_message = Message(sentence)
    test_message = Message(sentence)

    WhitespaceTokenizer().process(train_message)
    WhitespaceTokenizer().process(test_message)

    ftr.train(TrainingData([train_message]))

    ftr.process(test_message)

    vecs = test_message.get_sparse_features(TEXT, [])

    assert isinstance(vecs, scipy.sparse.coo_matrix)

    actual_vecs = vecs.toarray()

    assert np.all(actual_vecs[0] == expected)
    assert np.all(actual_vecs[-1] == expected_cls)
Example #11
def tags_to_ids(message: Message, tag_id_dict: Dict[Text, int]) -> List[int]:
    """Maps the entity tags of the message to the ids of the provided dict.

    Args:
        message: the message
        tag_id_dict: mapping of tags to ids

    Returns: a list of tag ids
    """
    if message.get(BILOU_ENTITIES):
        _tags = [
            tag_id_dict[_tag]
            if _tag in tag_id_dict else tag_id_dict[NO_ENTITY_TAG]
            for _tag in message.get(BILOU_ENTITIES)
        ]
    else:
        _tags = [
            tag_id_dict[NO_ENTITY_TAG] for _ in message.get(TOKENS_NAMES[TEXT])
        ]

    return _tags
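
The conditional expression above is equivalent to `dict.get` with a fallback; a standalone sketch with illustrative tags and ids:

NO_ENTITY_TAG = "O"  # illustrative value
tag_id_dict = {"O": 0, "B-city": 1, "I-city": 2, "L-city": 3, "U-city": 4}

bilou_tags = ["O", "B-city", "L-city", "X-unknown"]
tag_ids = [tag_id_dict.get(tag, tag_id_dict[NO_ENTITY_TAG]) for tag in bilou_tags]
print(tag_ids)  # [0, 1, 3, 0] -- unknown tags fall back to NO_ENTITY_TAG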
Example #12
def test_convert_featurizer_process():
    featurizer = ConveRTFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today ?"
    message = Message(sentence)
    tokens = ConveRTTokenizer().tokenize(message, attribute=TEXT)
    tokens = Tokenizer.add_cls_token(tokens, attribute=TEXT)
    message.set(TOKENS_NAMES[TEXT], tokens)

    featurizer.process(message)

    expected = np.array(
        [2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353])

    vecs = message.get(DENSE_FEATURE_NAMES[TEXT])

    assert len(tokens) == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)
Example #13
    def process(self, message: Message, **kwargs: Any) -> None:

        mitie_feature_extractor = kwargs.get("mitie_feature_extractor")
        if not mitie_feature_extractor:
            raise Exception("Failed to train 'MitieFeaturizer'. "
                            "Missing a proper MITIE feature extractor.")

        if self.clf:
            token_strs = self._tokens_of_message(message)
            intent, confidence = self.clf(token_strs, mitie_feature_extractor)
        else:
            # either the model didn't get trained or it wasn't
            # provided with any data
            intent = None
            confidence = 0.0

        message.set("intent", {
            "name": intent,
            "confidence": confidence
        },
                    add_to_output=True)
Example #14
def test_spacy_featurizer_sequence(sentence, expected, spacy_nlp):
    from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer

    doc = spacy_nlp(sentence)
    token_vectors = [t.vector for t in doc]

    ftr = SpacyFeaturizer.create({}, RasaNLUModelConfig())

    greet = {"intent": "greet", "text_features": [0.5]}

    message = Message(sentence, greet)
    message.set(SPACY_DOCS[TEXT], doc)

    ftr._set_spacy_features(message)

    seq_vecs, sen_vecs = message.get_dense_features(TEXT, [])
    vecs = seq_vecs[0][:5]

    assert np.allclose(token_vectors[0][:5], vecs, atol=1e-4)
    assert np.allclose(vecs, expected, atol=1e-4)
    assert sen_vecs is not None
Example #15
    def _from_crf_to_json(
        self, message: Message, entities: List[Any]
    ) -> List[Dict[Text, Any]]:

        if self.pos_features:
            tokens = message.get("spacy_doc")
        else:
            tokens = message.get("tokens")

        if len(tokens) != len(entities):
            raise Exception(
                "Inconsistency in amount of tokens between crfsuite and message"
            )

        if self.component_config["BILOU_flag"]:
            return self._convert_bilou_tagging_to_entity_result(
                message, tokens, entities
            )
        else:
            # not using BILOU tagging scheme, multi-word entities are split.
            return self._convert_simple_tagging_to_entity_result(tokens, entities)
Example #16
    def _read_intent(self, intent_js, examples_js):
        """Reads the intent and examples from respective jsons."""
        from rasa.nlu.training_data import Message, TrainingData

        intent = intent_js.get("name")

        training_examples = []
        for ex in examples_js:
            text, entities = self._join_text_chunks(ex["data"])
            training_examples.append(Message.build(text, intent, entities))

        return TrainingData(training_examples)
Example #17
def test_mitie_featurizer(mitie_feature_extractor):

    featurizer = MitieFeaturizer.create({}, RasaNLUModelConfig())

    sentence = "Hey how are you today"
    message = Message(sentence)
    MitieTokenizer().process(message)
    tokens = message.get(TOKENS_NAMES[TEXT_ATTRIBUTE])

    vecs = featurizer.features_for_tokens(tokens, mitie_feature_extractor)

    expected = np.array([
        0.00000000e00, -5.12735510e00, 4.39929873e-01, -5.60760403e00,
        -8.26445103e00
    ])
    expected_cls = np.array(
        [0.0, -4.4551446, 0.26073121, -1.46632245, -1.84205751])

    assert 6 == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)
Example #18
def map_message_entities(message: Message) -> List[Tuple[int, int, Text]]:
    """Maps the entities of the given message to their start, end, and tag values.

    Args:
        message: the message

    Returns: a list of start, end, and tag value tuples
    """
    def convert_entity(entity: Dict[Text, Any]) -> Tuple[int, int, Text]:
        return entity["start"], entity["end"], entity["entity"]

    return [convert_entity(entity) for entity in message.get(ENTITIES, [])]
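
The same transformation with plain dictionaries, independent of the `Message` class; the entity payloads are illustrative:

entities = [
    {"start": 0, "end": 4, "entity": "city", "value": "Rome"},
    {"start": 21, "end": 27, "entity": "city", "value": "Berlin"},
]

spans = [(e["start"], e["end"], e["entity"]) for e in entities]
print(spans)  # [(0, 4, 'city'), (21, 27, 'city')]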
Example #19
def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer

    lookups = [
        {
            "name": "drinks",
            "elements": ["mojito", "lemonade", "sweet berry wine", "tea", "club?mate"],
        },
        {"name": "plates", "elements": "data/test/lookup_tables/plates.txt"},
    ]
    ftr = RegexFeaturizer()
    ftr.add_lookup_tables(lookups)

    # adds tokens to the message
    component_config = {"name": "SpacyTokenizer"}
    tokenizer = SpacyTokenizer(component_config)
    message = Message(sentence)
    message.set("text_spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(result.toarray(), expected, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get(TOKENS_NAMES[TEXT], [])) > 0
    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
Example #20
def test_train_tokenizer(text, expected_tokens, expected_indices):
    tk = WhitespaceTokenizer()

    message = Message(text)
    message.set(RESPONSE_ATTRIBUTE, text)
    message.set(INTENT_ATTRIBUTE, text)

    training_data = TrainingData()
    training_data.training_examples = [message]

    tk.train(training_data)

    for attribute in [RESPONSE_ATTRIBUTE, TEXT_ATTRIBUTE]:
        tokens = training_data.training_examples[0].get(
            TOKENS_NAMES[attribute])

        assert [t.text for t in tokens] == expected_tokens
        assert [t.start for t in tokens] == [i[0] for i in expected_indices]
        assert [t.end for t in tokens] == [i[1] for i in expected_indices]

    # check intent attribute
    tokens = training_data.training_examples[0].get(
        TOKENS_NAMES[INTENT_ATTRIBUTE])

    assert [t.text for t in tokens] == [text]
Example #21
def test_crf_use_dense_features(spacy_nlp):
    crf_extractor = CRFEntityExtractor(
        component_config={
            "features": [
                ["low", "title", "upper", "pos", "pos2"],
                [
                    "low",
                    "suffix3",
                    "suffix2",
                    "upper",
                    "title",
                    "digit",
                    "pos",
                    "pos2",
                    "text_dense_features",
                ],
                ["low", "title", "upper", "pos", "pos2"],
            ]
        }
    )

    spacy_featurizer = SpacyFeaturizer()
    spacy_tokenizer = SpacyTokenizer()

    text = "Rasa is a company in Berlin"
    message = Message(text)
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))

    spacy_tokenizer.process(message)
    spacy_featurizer.process(message)

    text_data = crf_extractor._from_text_to_crf(message)
    features = crf_extractor._sentence_to_features(text_data)

    assert "0:text_dense_features" in features[0]
    for i in range(0, len(message.data.get("text_dense_features")[0])):
        assert (
            features[0]["0:text_dense_features"]["text_dense_features"][str(i)]
            == message.data.get("text_dense_features")[0][i]
        )
Example #22
    def _from_json_to_crf(
        self, message: Message, entity_offsets: List[Tuple[int, int, Text]]
    ) -> List[Tuple[Optional[Text], Optional[Text], Text, Dict[Text, Any]]]:
        """Convert json examples to format of underlying crfsuite."""

        if self.pos_features:
            from spacy.gold import GoldParse  # pytype: disable=import-error

            doc_or_tokens = message.get("spacy_doc")
            gold = GoldParse(doc_or_tokens, entities=entity_offsets)
            ents = [l[5] for l in gold.orig_annot]
        else:
            doc_or_tokens = message.get("tokens")
            ents = self._bilou_tags_from_offsets(doc_or_tokens, entity_offsets)

        # collect badly annotated examples
        collected = []
        for t, e in zip(doc_or_tokens, ents):
            if e == "-":
                collected.append(t)
            elif collected:
                collected_text = " ".join([t.text for t in collected])
                logger.warning(
                    "Misaligned entity annotation for '{}' "
                    "in sentence '{}' with intent '{}'. "
                    "Make sure the start and end values of the "
                    "annotated training examples end at token "
                    "boundaries (e.g. don't include trailing "
                    "whitespaces or punctuation)."
                    "".format(collected_text, message.text, message.get("intent"))
                )
                collected = []

        if not self.component_config["BILOU_flag"]:
            for i, label in enumerate(ents):
                if self._bilou_from_label(label) in {"B", "I", "U", "L"}:
                    # removes BILOU prefix from label
                    ents[i] = self._entity_from_label(label)

        return self._from_text_to_crf(message, ents)
Example #23
async def test_get_nlu_data(Faker: asynctest.MagicMock, load_data: asynctest.MagicMock) -> None:
    faker_ = Faker()
    faker_.name.return_value = "Nikola Tesla"
    training_data = TrainingData(
        training_examples=[
            Message.build("hello", "intent_test"),
            Message.build("hello @name", "intent_test"),
            Message.build("hello"),
        ]
    )
    load_data.return_value = training_data

    importer = PlaceholderImporter()
    importer.config = {"importers": [{"name": "rasam.PlaceholderImporter"}]}
    importer._nlu_files = ["test"]
    new_training_data = await importer.get_nlu_data()

    faker_.seed_instance.assert_called_once_with(importer.DEFAULT_FAKE_DATA_COUNT)
    load_data.assert_called_once_with("test", "en")
    message: Message
    expected_messages = [
        Message.build("hello", "intent_test"),
        Message.build("hello Nikola Tesla", "intent_test"),
        Message.build("hello"),
    ]
    for message, expected in zip(new_training_data.training_examples, expected_messages):
        assert message.get("intent") == expected.get("intent")
        assert message.get("text") == expected.get("text")
Example #24
def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa.nlu.featurizers.regex_featurizer import RegexFeaturizer

    patterns = [
        {"pattern": "[0-9]+", "name": "number", "usage": "intent"},
        {"pattern": "\\bhey*", "name": "hello", "usage": "intent"},
        {"pattern": "[0-1]+", "name": "binary", "usage": "intent"},
    ]
    ftr = RegexFeaturizer(known_patterns=patterns)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(sentence)
    message.set("spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr.features_for_patterns(message)
    assert np.allclose(result, expected, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get("tokens", [])) > 0
    # the number of regex matches on each token should match
    for i, token in enumerate(message.get("tokens")):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
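
Conceptually, the featurizer sets one binary feature per known pattern. A minimal, Rasa-free sketch of that idea using the same patterns (not the library's actual implementation):

import re
import numpy as np

patterns = [
    {"pattern": "[0-9]+", "name": "number"},
    {"pattern": "\\bhey*", "name": "hello"},
    {"pattern": "[0-1]+", "name": "binary"},
]

sentence = "hey 123"
features = np.array(
    [1.0 if re.search(p["pattern"], sentence) else 0.0 for p in patterns]
)
print(features)  # [1. 1. 1.]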
Example #25
def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp):
    from rasa.nlu.featurizers.regex_featurizer import RegexFeaturizer

    lookups = [
        {
            "name": "drinks",
            "elements": ["mojito", "lemonade", "sweet berry wine", "tea", "club?mate"],
        },
        {"name": "plates", "elements": "data/test/lookup_tables/plates.txt"},
    ]
    ftr = RegexFeaturizer(lookup_tables=lookups)

    # adds tokens to the message
    tokenizer = SpacyTokenizer()
    message = Message(sentence)
    message.set("spacy_doc", spacy_nlp(sentence))
    tokenizer.process(message)

    result = ftr.features_for_patterns(message)
    assert np.allclose(result, expected, atol=1e-10)

    # the tokenizer should have added tokens
    assert len(message.get("tokens", [])) > 0
    # the number of regex matches on each token should match
    for i, token in enumerate(message.get("tokens")):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
Example #26
    def tokenize(self, message: Message, attribute: Text) -> List[Token]:
        import mitie

        text = message.get(attribute)

        encoded_sentence = text.encode(DEFAULT_ENCODING)
        tokenized = mitie.tokenize_with_offsets(encoded_sentence)
        tokens = [
            self._token_from_offset(token, offset, encoded_sentence)
            for token, offset in tokenized
        ]

        return self._apply_token_pattern(tokens)
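
A hedged sketch of the underlying MITIE call in isolation, assuming the mitie package is installed; the loop above suggests tokens come back as (bytes, offset) pairs:

import mitie

encoded_sentence = "Hey how are you today".encode("utf-8")
for token, offset in mitie.tokenize_with_offsets(encoded_sentence):
    print(token.decode("utf-8"), offset)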
Example #27
    def process(self, message: Message, **kwargs: Any) -> None:
        """Process incoming message and compute and set features"""

        if self.vectorizers is None:
            logger.error("There is no trained CountVectorizer: "
                         "component is either not trained or "
                         "didn't receive enough training data")
        else:
            message_text = self._get_message_text_by_attribute(
                message, attribute=MESSAGE_TEXT_ATTRIBUTE)

            bag = (self.vectorizers[MESSAGE_TEXT_ATTRIBUTE].transform(
                [message_text]).toarray().squeeze())
            message.set(
                MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE],
                self._combine_with_existing_features(
                    message,
                    bag,
                    feature_name=MESSAGE_VECTOR_FEATURE_NAMES[
                        MESSAGE_TEXT_ATTRIBUTE],
                ),
            )
Example #28
def test_convert_featurizer_process(component_builder):
    tokenizer = component_builder.create_component_from_class(ConveRTTokenizer)
    featurizer = component_builder.create_component_from_class(ConveRTFeaturizer)

    sentence = "Hey how are you today ?"
    message = Message(sentence)
    tokens = tokenizer.tokenize(message, attribute=TEXT)
    tokens = tokenizer.add_cls_token(tokens, attribute=TEXT)
    message.set(TOKENS_NAMES[TEXT], tokens)

    featurizer.process(message, tf_hub_module=tokenizer.module)

    expected = np.array([2.2636216, -0.26475656, -1.1358104, -0.49751878, -1.3946456])
    expected_cls = np.array(
        [1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]
    )

    vecs = message.get(DENSE_FEATURE_NAMES[TEXT])

    assert len(tokens) == len(vecs)
    assert np.allclose(vecs[0][:5], expected, atol=1e-5)
    assert np.allclose(vecs[-1][:5], expected_cls, atol=1e-5)
Example #29
    def _get_tags(self, message: Message) -> Dict[Text, List[Text]]:
        """Get assigned entity tags of message."""
        tokens = train_utils.tokens_without_cls(message)
        tags = {}

        for tag_name in self.crf_order:
            if self.component_config[BILOU_FLAG]:
                bilou_key = bilou_utils.get_bilou_key_for_tag(tag_name)
                if message.get(bilou_key):
                    _tags = message.get(bilou_key)
                else:
                    _tags = [NO_ENTITY_TAG for _ in tokens]
            else:
                _tags = [
                    determine_token_labels(token,
                                           message.get(ENTITIES),
                                           attribute_key=tag_name)
                    for token in tokens
                ]
            tags[tag_name] = _tags

        return tags
Example #30
def test_spacy_ner_extractor(component_builder, spacy_nlp):
    _config = RasaNLUModelConfig(
        {"pipeline": [{
            "name": "SpacyEntityExtractor"
        }]})
    ext = component_builder.create_component(_config.for_component(0), _config)
    example = Message(
        "anywhere in the West", {
            "intent": "restaurant_search",
            "entities": [],
            "spacy_doc": spacy_nlp("anywhere in the west")
        })
    ext.process(example, spacy_nlp=spacy_nlp)

    assert len(example.get("entities", [])) == 1
    assert example.get("entities")[0] == {
        'start': 16,
        'extractor': 'SpacyEntityExtractor',
        'end': 20,
        'value': 'West',
        'entity': 'LOC',
        'confidence': None
    }

    # Test dimension filtering includes only specified dimensions

    example = Message(
        "anywhere in the West with Sebastian Thrun", {
            "intent": "example_intent",
            "entities": [],
            "spacy_doc": spacy_nlp("anywhere in the West with Sebastian Thrun")
        })
    _config = RasaNLUModelConfig(
        {"pipeline": [{
            "name": "SpacyEntityExtractor"
        }]})
    _config.set_component_attr(0, dimensions=["PERSON"])
    ext = component_builder.create_component(_config.for_component(0), _config)
    ext.process(example, spacy_nlp=spacy_nlp)

    assert len(example.get("entities", [])) == 1
    assert example.get("entities")[0] == {
        'start': 26,
        'extractor': 'SpacyEntityExtractor',
        'end': 41,
        'value': 'Sebastian Thrun',
        'entity': 'PERSON',
        'confidence': None
    }