def align_tokens(
    tokens_in: List[Text], token_end: int, token_start: int
) -> List[Token]:
    """Align sub-tokens of a language model with tokens returned by the WhitespaceTokenizer.

    As a language model might split a single word into multiple sub-tokens, we need to
    make sure that the start and end values of the first and last sub-token match the
    start and end values of the token returned by the WhitespaceTokenizer, as the
    entities use those start and end values.
    """
    tokens_out = []

    current_token_offset = token_start

    for index, string in enumerate(tokens_in):
        if index == 0:
            if index == len(tokens_in) - 1:
                s_token_end = token_end
            else:
                s_token_end = current_token_offset + len(string)
            tokens_out.append(Token(string, token_start, end=s_token_end))
        elif index == len(tokens_in) - 1:
            tokens_out.append(Token(string, current_token_offset, end=token_end))
        else:
            tokens_out.append(
                Token(
                    string,
                    current_token_offset,
                    end=current_token_offset + len(string),
                )
            )

        current_token_offset += len(string)

    return tokens_out
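# Illustrative usage of `align_tokens` (the values below are made up for the example,
# not taken from any test): suppose the word "sentence" occupies characters 10-18 of
# the original text and a language model splits it into the sub-tokens ["sent", "ence"].
# The first sub-token keeps the word's start offset and the last one keeps its end offset.
sub_tokens = align_tokens(["sent", "ence"], token_end=18, token_start=10)
assert (sub_tokens[0].start, sub_tokens[0].end) == (10, 14)
assert (sub_tokens[-1].start, sub_tokens[-1].end) == (14, 18)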
def test_tokens_comparison():
    x = Token("hello", 0)
    y = Token("Hello", 0)

    assert x == x
    assert y < x
    assert x != 1

    with pytest.raises(TypeError):
        assert y < "a"
def test_tokens_comparison():
    from rasa.nlu.tokenizers.tokenizer import Token

    x = Token("hello", 0)
    y = Token("Hello", 0)

    assert x == x
    assert y < x
    assert x != 1

    with pytest.raises(TypeError):
        assert y < "a"
def test_count_vector_featurizer_using_tokens(tokens, expected):
    from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
        CountVectorsFeaturizer,
    )

    ftr = CountVectorsFeaturizer(
        {"token_pattern": r"(?u)\b\w+\b", "return_sequence": True}
    )

    # use an empty string instead of real text to make sure the count vector
    # can only come from the `tokens` feature; using `message.text` would not
    # give the correct result
    tokens_feature = [Token(i, 0) for i in tokens]

    train_message = Message("")
    train_message.set("tokens", tokens_feature)
    # this is needed for a valid training example
    train_message.set("intent", "bla")

    data = TrainingData([train_message])
    ftr.train(data)

    test_message = Message("")
    test_message.set("tokens", tokens_feature)

    ftr.process(test_message)

    assert np.all(test_message.get("text_sparse_features").toarray()[0] == expected)
def test_lookup_tables_without_use_word_boundaries(
    sentence, tokens, expected, labeled_tokens
):
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer
    from rasa.nlu.tokenizers.tokenizer import Token

    lookups = [
        {"name": "cities", "elements": ["北京", "上海", "广州", "深圳", "杭州"]},
        {"name": "dates", "elements": ["昨天", "今天", "明天", "后天"]},
    ]
    ftr = RegexFeaturizer({"use_word_boundaries": False})
    training_data = TrainingData()
    training_data.lookup_tables = lookups
    ftr.train(training_data)

    # add tokens to the message
    message = Message(data={TEXT: sentence})
    message.set(TOKENS_NAMES[TEXT], [Token(word, start) for (word, start) in tokens])

    sequence_features, sentence_features = ftr._features_for_patterns(message, TEXT)
    assert np.allclose(sequence_features.toarray(), expected[:-1], atol=1e-10)
    assert np.allclose(sentence_features.toarray(), expected[-1], atol=1e-10)

    # the number of regex matches on each token should match
    for i, token in enumerate(message.get(TOKENS_NAMES[TEXT])):
        token_matches = token.get("pattern").values()
        num_matches = sum(token_matches)
        assert num_matches == labeled_tokens.count(i)
def test_encode_entities__with_entity_roles_and_groups():
    # create fake message that has been tokenized and entities have been extracted
    text = "I am flying from London to Paris"
    tokens = [
        Token(text=match.group(), start=match.start())
        for match in re.finditer(r"\S+", text)
    ]
    entity_tags = ["city", f"city{ENTITY_LABEL_SEPARATOR}to"]
    entities = [
        {
            ENTITY_ATTRIBUTE_TYPE: entity_tags[0],
            ENTITY_ATTRIBUTE_VALUE: "London",
            ENTITY_ATTRIBUTE_START: 17,
            ENTITY_ATTRIBUTE_END: 23,
        },
        {
            ENTITY_ATTRIBUTE_TYPE: entity_tags[1],
            ENTITY_ATTRIBUTE_VALUE: "Paris",
            ENTITY_ATTRIBUTE_START: 27,
            ENTITY_ATTRIBUTE_END: 32,
        },
    ]
    message = Message({TEXT: text, TOKENS_NAMES[TEXT]: tokens, ENTITIES: entities})

    # create a lookup table that has seen this message
    precomputations = MessageContainerForCoreFeaturization()
    precomputations.add(message)

    # instantiate matching domain and single state featurizer
    domain = Domain(
        intents=[],
        entities=entity_tags,
        slots=[],
        responses={},
        forms={},
        action_names=[],
    )
    f = SingleStateFeaturizer()
    f.prepare_for_training(domain)

    # encode!
    encoded = f.encode_entities(
        entity_data={TEXT: text, ENTITIES: entities},
        precomputations=precomputations,
    )

    # check
    assert len(f.entity_tag_specs) == 1
    tags_to_ids = f.entity_tag_specs[0].tags_to_ids
    for idx, entity_tag in enumerate(entity_tags):
        tags_to_ids[entity_tag] = idx + 1  # hence, city -> 1, city#to -> 2
    assert sorted(list(encoded.keys())) == [ENTITY_TAGS]
    assert np.all(
        encoded[ENTITY_TAGS][0].features == [[0], [0], [0], [0], [1], [0], [2]]
    )
async def _get_e2e_entity_evaluation_result(
    processor: "MessageProcessor",
    tracker: DialogueStateTracker,
    prediction: PolicyPrediction,
) -> Optional[EntityEvaluationResult]:
    previous_event = tracker.events[-1]

    if isinstance(previous_event, SlotSet):
        # UserUttered events with entities can be followed by SlotSet events
        # if slots are defined in the domain
        previous_event = tracker.get_last_event_for((UserUttered, ActionExecuted))

    if isinstance(previous_event, UserUttered):
        entities_predicted_by_policies = [
            entity
            for prediction_event in prediction.events
            if isinstance(prediction_event, EntitiesAdded)
            for entity in prediction_event.entities
        ]
        entity_targets = previous_event.entities
        if entity_targets or entities_predicted_by_policies:
            text = previous_event.text
            if text:
                parsed_message = await processor.parse_message(UserMessage(text=text))
                if parsed_message:
                    tokens = [
                        Token(text[start:end], start, end)
                        for start, end in parsed_message.get(TOKENS_NAMES[TEXT], [])
                    ]
                    return EntityEvaluationResult(
                        entity_targets, entities_predicted_by_policies, tokens, text
                    )
    return None
def _token_from_offset(
    self, text: bytes, offset: int, encoded_sentence: bytes
) -> Token:
    return Token(
        text.decode(DEFAULT_ENCODING),
        self._byte_to_char_offset(encoded_sentence, offset),
    )
def test_create_train_load_and_process(
    create_lexical_syntactic_featurizer: Callable[
        [Dict[Text, Any]], LexicalSyntacticFeaturizer
    ],
    default_model_storage: ModelStorage,
    default_execution_context: ExecutionContext,
    resource_lexical_syntactic_featurizer: Resource,
    feature_config: List[Text],
):
    config = {"alias": "lsf", "features": feature_config}
    featurizer = create_lexical_syntactic_featurizer(config)

    sentence = "Hello how are you"
    tokens = [
        Token(text=match[0], start=match.start())
        for match in re.finditer(r"\w+", sentence)
    ]
    message = Message(data={TOKENS_NAMES[TEXT]: tokens})

    featurizer.train(TrainingData([message]))

    loaded_featurizer = LexicalSyntacticFeaturizer.load(
        config={**LexicalSyntacticFeaturizer.get_default_config(), **config},
        model_storage=default_model_storage,
        execution_context=default_execution_context,
        resource=resource_lexical_syntactic_featurizer,
    )

    assert loaded_featurizer._feature_to_idx_dict == featurizer._feature_to_idx_dict
def test_only_featurizes_text_attribute(
    create_lexical_syntactic_featurizer: Callable[
        [Dict[Text, Any]], LexicalSyntacticFeaturizer
    ]
):
    # build a message with tokens for lots of attributes
    sentence = "hello goodbye hello"
    tokens = [
        Token(text=match[0], start=match.start())
        for match in re.finditer(r"\w+", sentence)
    ]
    message_data = {}
    for attribute in MESSAGE_ATTRIBUTES + DENSE_FEATURIZABLE_ATTRIBUTES:
        message_data[attribute] = sentence
        message_data[TOKENS_NAMES[attribute]] = tokens
    message = Message(data=message_data)

    # train and process
    featurizer = create_lexical_syntactic_featurizer(
        {"alias": "lsf", "features": [["BOS"]]}
    )
    featurizer.train(TrainingData([message]))
    featurizer.process([message])
    assert len(message.features) == 1
    assert message.features[0].attribute == TEXT
def test_count_vector_featurizer_using_tokens(tokens, expected):
    ftr = CountVectorsFeaturizer()

    # use an empty string instead of real text to make sure the count vector
    # can only come from the `tokens` feature; using `message.text` would not
    # give the correct result
    tokens_feature = [Token(i, 0) for i in tokens]

    train_message = Message("")
    train_message.set(TOKENS_NAMES[TEXT], tokens_feature)

    data = TrainingData([train_message])

    ftr.train(data)

    test_message = Message("")
    test_message.set(TOKENS_NAMES[TEXT], tokens_feature)

    ftr.process(test_message)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])

    assert np.all(seq_vec.toarray()[0] == expected)
    assert sen_vec is not None
def test_count_vector_featurizer_using_tokens(
    tokens: List[Text],
    expected: List[List[int]],
    create_featurizer: Callable[..., CountVectorsFeaturizerGraphComponent],
):
    ftr = create_featurizer()

    # use an empty string instead of real text to make sure the count vector
    # can only come from the `tokens` feature; using `message.text` would not
    # give the correct result
    tokens_feature = [Token(i, 0) for i in tokens]

    train_message = Message(data={TEXT: ""})
    train_message.set(TOKENS_NAMES[TEXT], tokens_feature)

    data = TrainingData([train_message])

    ftr.train(data)
    ftr.process_training_data(data)

    seq_vec, sen_vec = train_message.get_sparse_features(TEXT, [])
    if seq_vec:
        seq_vec = seq_vec.features
    if sen_vec:
        sen_vec = sen_vec.features

    assert np.all(seq_vec.toarray()[0] == expected)
    assert sen_vec is not None
def test_count_vector_featurizer_using_tokens(tokens, expected):
    ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"})

    # use an empty string instead of real text to make sure the count vector
    # can only come from the `tokens` feature; using `message.text` would not
    # give the correct result
    tokens_feature = [Token(i, 0) for i in tokens]

    train_message = Message("")
    train_message.set(TOKENS_NAMES[TEXT_ATTRIBUTE], tokens_feature)

    data = TrainingData([train_message])

    ftr.train(data)

    test_message = Message("")
    test_message.set(TOKENS_NAMES[TEXT_ATTRIBUTE], tokens_feature)

    ftr.process(test_message)

    assert np.all(
        test_message.get(SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]).toarray()[0] == expected
    )
def test_process_multiple_messages(
    create_lexical_syntactic_featurizer: Callable[
        [Dict[Text, Any]], LexicalSyntacticFeaturizer
    ]
):
    # build several messages with tokens
    multiple_messages = []
    for sentence in ["hello", "hello there"]:
        tokens = [
            Token(text=match[0], start=match.start())
            for match in re.finditer(r"\w+", sentence)
        ]
        multiple_messages.append(Message(data={TOKENS_NAMES[TEXT]: tokens}))

    # train and process
    featurizer = create_lexical_syntactic_featurizer(
        {"alias": "lsf", "features": [["prefix2"]]}
    )
    featurizer.train(TrainingData(multiple_messages))
    featurizer.process(multiple_messages)
    for message in multiple_messages:
        assert len(message.features) == 1
        assert message.features[0].attribute == TEXT

    # we know both texts were used for training if more than one feature has been
    # extracted, e.g. from the first message only the prefix "he" can be extracted
    assert multiple_messages[0].features[0].features.shape[-1] > 1
def test_warn_if_part_of_speech_features_cannot_be_computed(
    create_lexical_syntactic_featurizer: Callable[
        [Dict[Text, Any]], LexicalSyntacticFeaturizer
    ],
    sentence: Text,
    feature_config: Dict[Text, Any],
    expected_features: np.ndarray,
):
    featurizer = create_lexical_syntactic_featurizer(
        {"alias": "lsf", "features": feature_config}
    )

    # build the message - with tokens but *no* part-of-speech tags
    tokens = [
        Token(text=match[0], start=match.start())
        for match in re.finditer(r"\w+", sentence)
    ]
    message = Message(data={TOKENS_NAMES[TEXT]: tokens})

    # train
    with pytest.warns(
        UserWarning,
        match="Expected training data to include tokens with part-of-speech tags",
    ):
        featurizer.train(TrainingData([message]))
    assert not message.features

    # process
    with pytest.warns(None) as records:
        featurizer.process([message])
    assert len(records) == 0

    assert len(message.features) == 1
    feature = message.features[0]
    assert np.all(feature.features.todense() == expected_features)
def tokenize(self, message: Message, attribute: Text) -> List[Token]:
    doc = self.get_doc(message, attribute)

    return [
        Token(
            t.text, t.idx, lemma=t.lemma_, data={POS_TAG_KEY: self._tag_of_token(t)}
        )
        for t in doc
    ]
def tokenize(self, message: Message, attribute: Text) -> List[Token]:
    import jieba

    text = message.get(attribute)

    tokenized = jieba.tokenize(text)
    tokens = [Token(word, start) for (word, start, end) in tokenized]

    return tokens
def tokenize(self, message: Message, attribute: Text) -> List[Token]:
    text = message.get(attribute)

    # split the text into individual characters and track the running offset
    tokenized = [char for char in text]

    tokens = []
    offset = 0
    for word in tokenized:
        tokens.append(Token(word, offset))
        offset += len(word)

    return tokens
def tokenize(self, message: Message, attribute: Text) -> List[Token]: """Tokenizes the text of the provided attribute of the incoming message.""" import jieba text = message.get(attribute) tokenized = jieba.tokenize(text) tokens = [Token(word, start) for (word, start, end) in tokenized] return self._apply_token_pattern(tokens)
def test_ckip_featurizer(mock_POS_class):
    expected_pos_list = [
        ["Nd", "Nd", "VC", "Di", "Na", "Na", "VC", "Di", "Neu", "Nf"]
    ]
    mock_POS_inst = mock_POS_class.return_value
    mock_POS_inst.return_value = expected_pos_list

    msg = Message.build(text="昨天晚上吃了牛肉燴飯花了120元", intent="eat_dinner")
    msg.set(
        "tokens",
        [
            Token("昨天", 0),
            Token("晚上", 2),
            Token("吃", 4),
            Token("了", 5),
            Token("牛肉", 6),
            Token("燴飯", 8),
            Token("花", 10),
            Token("了", 11),
            Token("120", 12),
            Token("元", 15),
        ],
    )

    from rukip.featurizer import CKIPFeaturizer

    component_config = {"model_path": "./data"}
    ckip_featurizer = CKIPFeaturizer(component_config)
    ner_features = ckip_featurizer.gen_ner_features(msg)
    assert ner_features == [
        ["昨天", "Nd"],
        ["晚上", "Nd"],
        ["吃", "VC"],
        ["了", "Di"],
        ["牛肉", "Na"],
        ["燴飯", "Na"],
        ["花", "VC"],
        ["了", "Di"],
        ["120", "Neu"],
        ["元", "Nf"],
    ]

    component_config = {"model_path": "./data", "token_features": ["pos"]}
    ckip_featurizer = CKIPFeaturizer(component_config)
    ner_features = ckip_featurizer.gen_ner_features(msg)
    assert ner_features == [
        ["Nd"], ["Nd"], ["VC"], ["Di"], ["Na"], ["Na"], ["VC"], ["Di"], ["Neu"], ["Nf"]
    ]

    component_config = {"model_path": "./data", "token_features": ["word"]}
    ckip_featurizer = CKIPFeaturizer(component_config)
    ner_features = ckip_featurizer.gen_ner_features(msg)
    assert ner_features == [
        ["昨天"], ["晚上"], ["吃"], ["了"], ["牛肉"], ["燴飯"], ["花"], ["了"], ["120"], ["元"]
    ]
def tokenize(self, message: Message, attribute: Text) -> List[Token]:
    text = message.get(attribute)

    doc = self.nlp(text)
    tokens = [
        Token(text=t.text, start=t.idx)
        for t in doc
        if t.text and t.text.strip()
    ]

    return self._apply_token_pattern(tokens)
def tokenize(self, message: Message, attribute: Text) -> List[Token]: import jieba text = message.get(attribute) if self.component_config.get("case_sensitive", False): tokenized = jieba.tokenize(text.lower()) else: tokenized = jieba.tokenize(text) tokens = [Token(word, start) for (word, start, end) in tokenized] return self._apply_token_pattern(tokens)
def tokenize(self, message: Message, attribute: Text) -> List[Token]:
    from janome.tokenizer import Tokenizer

    text = message.get(attribute)
    text = self.removePunctuation(text)

    tokenizer = Tokenizer()
    tokenized = tokenizer.tokenize(text)

    tokens = []
    for token in tokenized:
        tokens.append(Token(token.node.surface, token.node.pos - 1))

    return self._apply_token_pattern(tokens)
def tokenize(self, text: Text, attribute: Text = TEXT_ATTRIBUTE) -> List[Token]:
    import jieba

    text = self.preprocess_text(text, attribute)

    tokenized = jieba.tokenize(text)
    tokens = [Token(word, start) for (word, start, end) in tokenized]

    self.add_cls_token(tokens, attribute)

    return tokens
def test_align_token_features_convert():
    tokens = [
        Token("This", 0, data={NUMBER_OF_SUB_TOKENS: 1}),
        Token("is", 5, data={NUMBER_OF_SUB_TOKENS: 1}),
        Token("a", 8, data={NUMBER_OF_SUB_TOKENS: 1}),
        Token("sentence", 10, data={NUMBER_OF_SUB_TOKENS: 2}),
        Token("embedding", 19, data={NUMBER_OF_SUB_TOKENS: 4}),
    ]

    seq_dim = sum(t.get(NUMBER_OF_SUB_TOKENS) for t in tokens)
    token_features = np.random.rand(1, seq_dim, 64)

    actual_features = train_utils.align_token_features([tokens], token_features)

    assert np.all(actual_features[0][0] == token_features[0][0])
    assert np.all(actual_features[0][1] == token_features[0][1])
    assert np.all(actual_features[0][2] == token_features[0][2])
    # sentence is split into 2 sub-tokens
    assert np.all(actual_features[0][3] == np.mean(token_features[0][3:5], axis=0))
    # embedding is split into 4 sub-tokens
    assert np.all(actual_features[0][4] == np.mean(token_features[0][5:10], axis=0))
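# Illustrative sketch only, not Rasa's implementation: the behaviour asserted in the
# test above is that the features of all sub-tokens belonging to one token are averaged
# into a single vector. The helper name and signature below are hypothetical.
from typing import List

import numpy as np


def naive_align_token_features(
    sub_token_counts: List[int], sub_token_features: np.ndarray
) -> np.ndarray:
    """Average contiguous blocks of sub-token features, one block per token."""
    aligned = []
    offset = 0
    for count in sub_token_counts:
        aligned.append(np.mean(sub_token_features[offset : offset + count], axis=0))
        offset += count
    return np.stack(aligned)


# e.g. counts [1, 1, 1, 2, 4] over 9 sub-token vectors yield 5 token vectors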
def tokenize(self, message: Message, attribute: Text) -> List[Token]:
    doc = self.get_doc(message, attribute)

    tokens = [
        Token(
            t.text, t.idx, lemma=t.lemma_, data={POS_TAG_KEY: self._tag_of_token(t)}
        )
        for t in doc
        if t.text and t.text.strip()
    ]

    return self._apply_token_pattern(tokens)
def tokenize(self, message: Message, attribute: Text) -> List[Token]:
    text = message.get(attribute)

    encoded_input = self.tokenizer(
        text, return_offsets_mapping=True, add_special_tokens=False
    )
    token_position_pair = zip(
        encoded_input.tokens(), encoded_input["offset_mapping"]
    )

    return [
        Token(text=token_text, start=position[0], end=position[1])
        for token_text, position in token_position_pair
    ]
def tokenize(self, text, msg_tokens):
    # collect the entity values attached to the message and use them as the words
    words = []
    for token in msg_tokens.entities:
        words.append(token.value)

    # locate each word in the original text to recover its character offset
    running_offset = 0
    tokens = []
    for word in words:
        word_offset = text.index(word, running_offset)
        word_len = len(word)
        running_offset = word_offset + word_len
        tokens.append(Token(word, word_offset))

    return tokens
def tokenize(self, message: Message, attribute: Text) -> List[Token]:
    import MicroTokenizer

    text = message.get(attribute)

    tokenized = MicroTokenizer.cut(text)

    tokens = []
    offset = 0
    for word in tokenized:
        tokens.append(Token(word, offset))
        offset += len(word)

    return tokens
def tokenize(self, message: Message, attribute: Text) -> List[Token]:
    text = message.get(attribute)

    if self.lang in ("zh", "ja"):
        r = query_data_by_url(
            cf.servant_by_lang(self.lang),
            "tokens",
            {"lang": self.lang, "sents": text},
        )
        words = r["data"]

        running_offset = 0
        tokens = []
        for word in words:
            word_offset = text.index(word, running_offset)
            word_len = len(word)
            running_offset = word_offset + word_len
            tokens.append(Token(word, word_offset))
        return tokens

    return super().tokenize(message, attribute)