def test_spacy_pos_tags(text, expected_pos_tags, spacy_nlp):
    """SpacyTokenizer should attach the expected POS tag to every token."""
    tokenizer = SpacyTokenizer(SpacyTokenizer.get_default_config())
    message = Message.build(text=text)
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))

    tokens = tokenizer.tokenize(message, attribute=TEXT)

    actual_pos_tags = [token.data.get("pos") for token in tokens]
    assert actual_pos_tags == expected_pos_tags
def test_spacy(text, expected_tokens, expected_indices, spacy_nlp):
    """SpacyTokenizer should produce the expected token texts and offsets."""
    tokenizer = SpacyTokenizer(SpacyTokenizer.get_default_config())
    message = Message.build(text=text)
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))

    tokens = tokenizer.tokenize(message, attribute=TEXT)

    expected_starts = [start for start, _ in expected_indices]
    expected_ends = [end for _, end in expected_indices]

    assert [token.text for token in tokens] == expected_tokens
    assert [token.start for token in tokens] == expected_starts
    assert [token.end for token in tokens] == expected_ends
def test_train_tokenizer(text, expected_tokens, expected_indices, spacy_nlp):
    """Processing training data should set tokens on both TEXT and RESPONSE."""
    tokenizer = SpacyTokenizer(SpacyTokenizer.get_default_config())
    message = Message.build(text=text)
    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))
    message.set(RESPONSE, text)
    message.set(SPACY_DOCS[RESPONSE], spacy_nlp(text))

    training_data = TrainingData()
    training_data.training_examples = [message]
    tokenizer.process_training_data(training_data)

    expected_starts = [start for start, _ in expected_indices]
    expected_ends = [end for _, end in expected_indices]

    # Both attributes were tokenized from the same text, so the expectations
    # are identical for each.
    for attribute in [RESPONSE, TEXT]:
        tokens = training_data.training_examples[0].get(TOKENS_NAMES[attribute])
        assert [token.text for token in tokens] == expected_tokens
        assert [token.start for token in tokens] == expected_starts
        assert [token.end for token in tokens] == expected_ends
def test_convert_training_examples(
    spacy_nlp: Language,
    text: Text,
    intent: Optional[Text],
    entities: Optional[List[Dict[Text, Any]]],
    attributes: List[Text],
    real_sparse_feature_sizes: Dict[Text, Dict[Text, List[int]]],
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
):
    """Featurizing training examples should yield the requested attributes only,
    with the expected number of feature groups and sparse feature sizes."""
    message = Message(data={TEXT: text, INTENT: intent, ENTITIES: entities})

    # Build the processing pipeline: tokenizer + sparse + dense featurizers.
    tokenizer = SpacyTokenizer.create(
        SpacyTokenizer.get_default_config(),
        default_model_storage,
        Resource("tokenizer"),
        default_execution_context,
    )
    count_vectors_featurizer = CountVectorsFeaturizer.create(
        CountVectorsFeaturizer.get_default_config(),
        default_model_storage,
        Resource("count_featurizer"),
        default_execution_context,
    )
    spacy_featurizer = SpacyFeaturizer.create(
        SpacyFeaturizer.get_default_config(),
        default_model_storage,
        Resource("spacy_featurizer"),
        default_execution_context,
    )

    message.set(SPACY_DOCS[TEXT], spacy_nlp(text))

    training_data = TrainingData([message])
    tokenizer.process_training_data(training_data)
    count_vectors_featurizer.train(training_data)
    count_vectors_featurizer.process_training_data(training_data)
    spacy_featurizer.process_training_data(training_data)

    entity_tag_spec = [
        EntityTagSpec(
            "entity",
            {0: "O", 1: "name", 2: "location"},
            {"O": 0, "name": 1, "location": 2},
            3,
        )
    ]

    output, sparse_feature_sizes = model_data_utils.featurize_training_examples(
        [message], attributes=attributes, entity_tag_specs=entity_tag_spec
    )

    assert len(output) == 1
    for attribute in attributes:
        assert attribute in output[0]
    # Attributes not requested must not appear in the featurized output.
    for attribute in {INTENT, TEXT, ENTITIES} - set(attributes):
        assert attribute not in output[0]

    # we have sparse sentence, sparse sequence, dense sentence, and dense sequence
    # features in the list
    assert len(output[0][TEXT]) == 4
    if INTENT in attributes:
        # we will just have space sentence features
        assert len(output[0][INTENT]) == 1
    if ENTITIES in attributes:
        # we will just have space sentence features
        assert len(output[0][ENTITIES]) == len(entity_tag_spec)

    # check that it calculates sparse_feature_sizes correctly
    assert sparse_feature_sizes == real_sparse_feature_sizes
def spacy_tokenizer() -> SpacyTokenizer:
    """Provide a SpacyTokenizer built from its default configuration."""
    default_config = SpacyTokenizer.get_default_config()
    return SpacyTokenizer(default_config)