def test_spacy(spacy_nlp):
    from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer

    component_config = {"use_cls_token": False}
    tk = SpacyTokenizer(component_config)

    text = "Forecast for lunch"
    assert [t.text for t in tk.tokenize(spacy_nlp(text))] == [
        "Forecast",
        "for",
        "lunch",
    ]
    assert [t.lemma for t in tk.tokenize(spacy_nlp(text))] == [
        "forecast",
        "for",
        "lunch",
    ]
    assert [t.offset for t in tk.tokenize(spacy_nlp(text))] == [0, 9, 13]

    text = "hey ńöñàśçií how're you?"
    assert [t.text for t in tk.tokenize(spacy_nlp(text))] == [
        "hey",
        "ńöñàśçií",
        "how",
        "'re",
        "you",
        "?",
    ]
    assert [t.offset for t in tk.tokenize(spacy_nlp(text))] == [0, 4, 13, 16, 20, 23]
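# All of these tests rely on a `spacy_nlp` pytest fixture that yields a loaded
# spaCy pipeline. Below is a minimal sketch of such a fixture; the model name
# and fixture scope are assumptions here, and the real conftest.py may differ.
import pytest
import spacy


@pytest.fixture(scope="session")
def spacy_nlp():
    # Any English pipeline with a tagger and lemmatizer works for these tests;
    # "en_core_web_md" is assumed.
    return spacy.load("en_core_web_md")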
def test_spacy_add_cls_token(spacy_nlp):
    from rasa.nlu.constants import CLS_TOKEN  # assumed source of CLS_TOKEN
    from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer

    component_config = {"use_cls_token": True}
    tk = SpacyTokenizer(component_config)

    text = "Forecast for lunch"
    assert [t.text for t in tk.tokenize(spacy_nlp(text))] == [
        "Forecast",
        "for",
        "lunch",
        CLS_TOKEN,
    ]
    # The __CLS__ token is appended after the text, hence offset 19
    # (len("Forecast for lunch") + 1).
    assert [t.offset for t in tk.tokenize(spacy_nlp(text))] == [0, 9, 13, 19]
# Assumed imports for this version of the test (Rasa 2.x/3.x-era module paths):
from rasa.nlu.constants import SPACY_DOCS
from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
from rasa.shared.nlu.constants import TEXT
from rasa.shared.nlu.training_data.message import Message


# `text` and `expected_pos_tags` are supplied by pytest parametrization.
def test_spacy_pos_tags(text, expected_pos_tags, spacy_nlp):
    tk = SpacyTokenizer(SpacyTokenizer.get_default_config())

    message = Message.build(text=text)
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))

    tokens = tk.tokenize(message, attribute=TEXT)

    assert [t.data.get("pos") for t in tokens] == expected_pos_tags
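# For reference, a minimal sketch of where the "pos" values asserted above
# originate: the spaCy pipeline itself. This illustrates plain spaCy usage and
# is not Rasa source code; the model name is an assumption.
import spacy


def show_pos_tags(text: str) -> None:
    nlp = spacy.load("en_core_web_md")
    for token in nlp(text):
        # spaCy exposes coarse-grained tags as token.pos_ and fine-grained
        # treebank tags as token.tag_; the tokenizer stores one of these on
        # each Token under the "pos" key.
        print(token.text, token.pos_, token.tag_)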
def test_spacy(text, expected_tokens, expected_indices, spacy_nlp):
    tk = SpacyTokenizer(SpacyTokenizer.get_default_config())

    message = Message.build(text=text)
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))

    tokens = tk.tokenize(message, attribute=TEXT)

    assert [t.text for t in tokens] == expected_tokens
    assert [t.start for t in tokens] == [i[0] for i in expected_indices]
    assert [t.end for t in tokens] == [i[1] for i in expected_indices]
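# A sketch of the parametrization that could drive the test above. The token
# texts and offsets are taken from the hard-coded assertions earlier in this
# file; the decorator itself is an assumption, not the original test code.
import pytest


@pytest.mark.parametrize(
    "text, expected_tokens, expected_indices",
    [
        (
            "Forecast for lunch",
            ["Forecast", "for", "lunch"],
            [(0, 8), (9, 12), (13, 18)],
        )
    ],
)
def test_spacy_parametrized_example(text, expected_tokens, expected_indices, spacy_nlp):
    # Same body as test_spacy above.
    ...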
# Older variant of the same test (Rasa 1.x-era API: TEXT_ATTRIBUTE constant,
# positional Message constructor, no component config). Assumed imports:
from rasa.nlu.constants import SPACY_DOCS, TEXT_ATTRIBUTE
from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
from rasa.nlu.training_data import Message


def test_spacy(text, expected_tokens, expected_indices, spacy_nlp):
    tk = SpacyTokenizer()

    message = Message(text)
    message.set(SPACY_DOCS[TEXT_ATTRIBUTE], spacy_nlp(text))

    tokens = tk.tokenize(message, attribute=TEXT_ATTRIBUTE)

    assert [t.text for t in tokens] == expected_tokens
    assert [t.start for t in tokens] == [i[0] for i in expected_indices]
    assert [t.end for t in tokens] == [i[1] for i in expected_indices]
def test_spacy(spacy_nlp):
    from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer

    tk = SpacyTokenizer()

    text = "Forecast for lunch"
    assert [t.text for t in tk.tokenize(spacy_nlp(text))] == [
        "Forecast",
        "for",
        "lunch",
    ]
    assert [t.offset for t in tk.tokenize(spacy_nlp(text))] == [0, 9, 13]

    text = "hey ńöñàśçií how're you?"
    assert [t.text for t in tk.tokenize(spacy_nlp(text))] == [
        "hey",
        "ńöñàśçií",
        "how",
        "'re",
        "you",
        "?",
    ]
    assert [t.offset for t in tk.tokenize(spacy_nlp(text))] == [0, 4, 13, 16, 20, 23]
# Assumed imports (Rasa 1.10-era module paths; these moved in later versions):
from rasa.nlu.constants import SPACY_DOCS, TEXT
from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor
from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer
from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
from rasa.nlu.training_data import Message


def test_crf_create_entity_dict(spacy_nlp):
    crf_extractor = CRFEntityExtractor()
    spacy_tokenizer = SpacyTokenizer()
    white_space_tokenizer = WhitespaceTokenizer()

    examples = [
        {
            "message": Message(
                "where is St. Michael's Hospital?",
                {
                    "intent": "search_location",
                    "entities": [
                        {
                            "start": 9,
                            "end": 31,
                            "value": "St. Michael's Hospital",
                            "entity": "hospital",
                            "SpacyTokenizer": {
                                "entity_start_token_idx": 2,
                                "entity_end_token_idx": 5,
                            },
                            "WhitespaceTokenizer": {
                                "entity_start_token_idx": 2,
                                "entity_end_token_idx": 5,
                            },
                        }
                    ],
                    SPACY_DOCS[TEXT]: spacy_nlp("where is St. Michael's Hospital?"),
                },
            )
        },
        {
            "message": Message(
                "where is Children's Hospital?",
                {
                    "intent": "search_location",
                    "entities": [
                        {
                            "start": 9,
                            "end": 28,
                            "value": "Children's Hospital",
                            "entity": "hospital",
                            "SpacyTokenizer": {
                                "entity_start_token_idx": 2,
                                "entity_end_token_idx": 4,
                            },
                            "WhitespaceTokenizer": {
                                "entity_start_token_idx": 2,
                                "entity_end_token_idx": 4,
                            },
                        }
                    ],
                    SPACY_DOCS[TEXT]: spacy_nlp("where is Children's Hospital?"),
                },
            )
        },
    ]

    for ex in examples:
        # The spaCy tokenizer reads the Doc stored on the message, while the
        # whitespace tokenizer works on the raw text.
        spacy_tokens = spacy_tokenizer.tokenize(ex["message"], TEXT)
        white_space_tokens = white_space_tokenizer.tokenize(ex["message"], TEXT)
        for tokenizer, tokens in [
            ("SpacyTokenizer", spacy_tokens),
            ("WhitespaceTokenizer", white_space_tokens),
        ]:
            for entity in ex["message"].get("entities"):
                parsed_entities = crf_extractor._create_entity_dict(
                    ex["message"],
                    tokens,
                    entity[tokenizer]["entity_start_token_idx"],
                    entity[tokenizer]["entity_end_token_idx"],
                    entity["entity"],
                    0.8,
                )
                assert parsed_entities == {
                    "start": entity["start"],
                    "end": entity["end"],
                    "value": entity["value"],
                    "entity": entity["entity"],
                    "confidence": 0.8,
                }