Example #1
def test_spacy(spacy_nlp):
    from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer

    component_config = {"use_cls_token": False}

    tk = SpacyTokenizer(component_config)

    text = "Forecast for lunch"
    assert [t.text for t in tk.tokenize(spacy_nlp(text))] == [
        "Forecast",
        "for",
        "lunch",
    ]
    assert [t.lemma for t in tk.tokenize(spacy_nlp(text))] == [
        "forecast",
        "for",
        "lunch",
    ]

    assert [t.offset for t in tk.tokenize(spacy_nlp(text))] == [0, 9, 13]

    text = "hey ńöñàśçií how're you?"
    assert [t.text for t in tk.tokenize(spacy_nlp(text))] == [
        "hey",
        "ńöñàśçií",
        "how",
        "'re",
        "you",
        "?",
    ]
    assert [t.offset
            for t in tk.tokenize(spacy_nlp(text))] == [0, 4, 13, 16, 20, 23]
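All of these snippets rely on a spacy_nlp pytest fixture that the excerpts never show. A minimal sketch of such a fixture in conftest.py, assuming the en_core_web_sm model is installed (the real Rasa test suite may load a different model or use a different scope):

import pytest
import spacy


@pytest.fixture(scope="session")
def spacy_nlp():
    # Load the model once per test session; requires
    # `python -m spacy download en_core_web_sm` beforehand.
    return spacy.load("en_core_web_sm")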
Example #2
def test_spacy(spacy_nlp):
    from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
    tk = SpacyTokenizer()

    text = "Forecast for lunch"
    assert [t.text for t in tk.tokenize(spacy_nlp(text))] == \
           ['Forecast', 'for', 'lunch']
    assert [t.offset for t in tk.tokenize(spacy_nlp(text))] == \
           [0, 9, 13]

    text = "hey ńöñàśçií how're you?"
    assert [t.text for t in tk.tokenize(spacy_nlp(text))] == \
           ['hey', 'ńöñàśçií', 'how', '\'re', 'you', '?']
    assert [t.offset for t in tk.tokenize(spacy_nlp(text))] == \
           [0, 4, 13, 16, 20, 23]
Example #3
def test_spacy_add_cls_token(spacy_nlp):
    from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer

    # Assumed import location: CLS_TOKEN lived in rasa.nlu.constants
    # in the Rasa 1.x line this test targets.
    from rasa.nlu.constants import CLS_TOKEN

    component_config = {"use_cls_token": True}

    tk = SpacyTokenizer(component_config)

    text = "Forecast for lunch"
    assert [t.text for t in tk.tokenize(spacy_nlp(text))] == [
        "Forecast",
        "for",
        "lunch",
        CLS_TOKEN,
    ]
    assert [t.offset for t in tk.tokenize(spacy_nlp(text))] == [0, 9, 13, 19]
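The trailing offset of 19 in the assertion above is one past the end of "Forecast for lunch" (18 characters): the CLS token is appended after the text rather than found in it. A quick illustration, assuming the appended-token behavior of Rasa 1.x:

# Illustration only, not part of the original test.
text = "Forecast for lunch"
print(len(text) + 1)  # 19 -- the offset asserted for the appended CLS token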
Example #4
def test_spacy_pos_tags(text, expected_pos_tags, spacy_nlp):
    # Assumed import paths, as in Rasa 2.x/3.x; adjust for other versions.
    from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
    from rasa.nlu.constants import SPACY_DOCS
    from rasa.shared.nlu.constants import TEXT
    from rasa.shared.nlu.training_data.message import Message

    tk = SpacyTokenizer(SpacyTokenizer.get_default_config())

    message = Message.build(text=text)
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))

    tokens = tk.tokenize(message, attribute=TEXT)

    assert [t.data.get("pos") for t in tokens] == expected_pos_tags
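The text and expected_pos_tags arguments are supplied by a @pytest.mark.parametrize decorator that the excerpt omits. A hedged sketch of what that decorator might look like; the case below is an assumption, and the exact tags depend on the spaCy model in use:

import pytest


@pytest.mark.parametrize(
    "text, expected_pos_tags",
    [
        # Hypothetical case; the original parametrized data is not shown.
        ("Forecast for lunch", ["NOUN", "ADP", "NOUN"]),
    ],
)
def test_spacy_pos_tags(text, expected_pos_tags, spacy_nlp):
    ...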
Example #5
def test_spacy(text, expected_tokens, expected_indices, spacy_nlp):
    # Assumed import paths, as in Rasa 2.x/3.x; adjust for other versions.
    from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
    from rasa.nlu.constants import SPACY_DOCS
    from rasa.shared.nlu.constants import TEXT
    from rasa.shared.nlu.training_data.message import Message

    tk = SpacyTokenizer(SpacyTokenizer.get_default_config())

    message = Message.build(text=text)
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))

    tokens = tk.tokenize(message, attribute=TEXT)

    assert [t.text for t in tokens] == expected_tokens
    assert [t.start for t in tokens] == [i[0] for i in expected_indices]
    assert [t.end for t in tokens] == [i[1] for i in expected_indices]
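Likewise, text, expected_tokens, and expected_indices come from a parametrize decorator not shown here. A sketch using the Unicode sentence from the tests above; the start offsets match the [0, 4, 13, 16, 20, 23] asserted earlier, while the end offsets are inferred from the token lengths:

import pytest


@pytest.mark.parametrize(
    "text, expected_tokens, expected_indices",
    [
        (
            "hey ńöñàśçií how're you?",
            ["hey", "ńöñàśçií", "how", "'re", "you", "?"],
            [(0, 3), (4, 12), (13, 16), (16, 19), (20, 23), (23, 24)],
        ),
    ],
)
def test_spacy(text, expected_tokens, expected_indices, spacy_nlp):
    ...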
Example #6
def test_spacy(text, expected_tokens, expected_indices, spacy_nlp):
    # Assumed import paths, as in older Rasa 1.x releases
    # where TEXT_ATTRIBUTE was still in use.
    from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
    from rasa.nlu.constants import SPACY_DOCS, TEXT_ATTRIBUTE
    from rasa.nlu.training_data import Message

    tk = SpacyTokenizer()

    message = Message(text)
    message.set(SPACY_DOCS[TEXT_ATTRIBUTE], spacy_nlp(text))

    tokens = tk.tokenize(message, attribute=TEXT_ATTRIBUTE)

    assert [t.text for t in tokens] == expected_tokens
    assert [t.start for t in tokens] == [i[0] for i in expected_indices]
    assert [t.end for t in tokens] == [i[1] for i in expected_indices]
Example #7
def test_spacy(spacy_nlp):
    from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer

    tk = SpacyTokenizer()

    text = "Forecast for lunch"
    assert [t.text for t in tk.tokenize(spacy_nlp(text))] == [
        "Forecast",
        "for",
        "lunch",
    ]
    assert [t.offset for t in tk.tokenize(spacy_nlp(text))] == [0, 9, 13]

    text = "hey ńöñàśçií how're you?"
    assert [t.text for t in tk.tokenize(spacy_nlp(text))] == [
        "hey",
        "ńöñàśçií",
        "how",
        "'re",
        "you",
        "?",
    ]
    assert [t.offset for t in tk.tokenize(spacy_nlp(text))] == [0, 4, 13, 16, 20, 23]
Example #8
def test_crf_create_entity_dict(spacy_nlp):
    # Assumed import paths, as in the Rasa 1.10-era test suite
    # this snippet appears to come from.
    from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor
    from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
    from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
    from rasa.nlu.training_data import Message
    from rasa.nlu.constants import SPACY_DOCS, TEXT

    crf_extractor = CRFEntityExtractor()
    spacy_tokenizer = SpacyTokenizer()
    white_space_tokenizer = WhitespaceTokenizer()

    examples = [
        {
            "message": Message(
                "where is St. Michael's Hospital?",
                {
                    "intent": "search_location",
                    "entities": [
                        {
                            "start": 9,
                            "end": 31,
                            "value": "St. Michael's Hospital",
                            "entity": "hospital",
                            "SpacyTokenizer": {
                                "entity_start_token_idx": 2,
                                "entity_end_token_idx": 5,
                            },
                            "WhitespaceTokenizer": {
                                "entity_start_token_idx": 2,
                                "entity_end_token_idx": 5,
                            },
                        }
                    ],
                    SPACY_DOCS[TEXT]: spacy_nlp("where is St. Michael's Hospital?"),
                },
            )
        },
        {
            "message": Message(
                "where is Children's Hospital?",
                {
                    "intent": "search_location",
                    "entities": [
                        {
                            "start": 9,
                            "end": 28,
                            "value": "Children's Hospital",
                            "entity": "hospital",
                            "SpacyTokenizer": {
                                "entity_start_token_idx": 2,
                                "entity_end_token_idx": 4,
                            },
                            "WhitespaceTokenizer": {
                                "entity_start_token_idx": 2,
                                "entity_end_token_idx": 4,
                            },
                        }
                    ],
                    SPACY_DOCS[TEXT]: spacy_nlp("where is Children's Hospital?"),
                },
            )
        },
    ]
    for ex in examples:
        # the spaCy tokenizer reads the Doc stored on the message,
        # while the whitespace tokenizer works from the raw text
        spacy_tokens = spacy_tokenizer.tokenize(ex["message"], TEXT)
        white_space_tokens = white_space_tokenizer.tokenize(ex["message"], TEXT)
        for tokenizer, tokens in [
            ("SpacyTokenizer", spacy_tokens),
            ("WhitespaceTokenizer", white_space_tokens),
        ]:
            for entity in ex["message"].get("entities"):
                parsed_entities = crf_extractor._create_entity_dict(
                    ex["message"],
                    tokens,
                    entity[tokenizer]["entity_start_token_idx"],
                    entity[tokenizer]["entity_end_token_idx"],
                    entity["entity"],
                    0.8,
                )
                assert parsed_entities == {
                    "start": entity["start"],
                    "end": entity["end"],
                    "value": entity["value"],
                    "entity": entity["entity"],
                    "confidence": 0.8,
                }