def check_subtokens(
    texts: List[Text],
    messages: List[Message],
    expected_number_of_sub_tokens: List[List[float]],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    """Checks that we get the correct number of sub tokens."""
    for index, message in enumerate(messages):
        assert [
            t.get(NUMBER_OF_SUB_TOKENS) for t in message.get(TOKENS_NAMES[TEXT])
        ] == expected_number_of_sub_tokens[index]
        assert len(message.get(TOKENS_NAMES[TEXT])) == len(
            whitespace_tokenizer.tokenize(Message.build(text=texts[index]), TEXT)
        )
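

# A minimal sketch of how `check_subtokens` can be driven (the texts, expected
# counts, and the one-sub-token-per-token assumption below are illustrative,
# not from the original suite): tokenize each text, attach a sub-token count to
# every token, store the tokens on the message, then let the helper verify both
# the counts and the token totals.
def test_check_subtokens_sketch(
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    texts = ["hello there", "goodbye"]
    # Pretend every whitespace token maps to exactly one sub token.
    expected_number_of_sub_tokens = [[1.0, 1.0], [1.0]]

    messages = []
    for text in texts:
        message = Message.build(text=text)
        tokens = whitespace_tokenizer.tokenize(message, TEXT)
        for token in tokens:
            token.set(NUMBER_OF_SUB_TOKENS, 1.0)
        message.set(TOKENS_NAMES[TEXT], tokens)
        messages.append(message)

    check_subtokens(
        texts, messages, expected_number_of_sub_tokens, whitespace_tokenizer
    )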


async def test_interpreter_parses_text_tokens(
    response_selector_interpreter: Interpreter,
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    text = "Hello there"
    tokens = whitespace_tokenizer.tokenize(Message(data={"text": text}), "text")
    indices = [(t.start, t.end) for t in tokens]

    parsed_data = response_selector_interpreter.parse(text)
    assert "text_tokens" in parsed_data

    parsed_tokens = parsed_data.get("text_tokens")
    assert parsed_tokens == indices
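

# Hand-computed reference for the span contract relied on above (assuming
# standard whitespace splitting): "Hello there" yields "Hello" at (0, 5) and
# "there" at (6, 11), so `parsed_data["text_tokens"]` is expected to equal
# [(0, 5), (6, 11)].
def test_whitespace_token_offsets_sketch(
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    tokens = whitespace_tokenizer.tokenize(
        Message(data={"text": "Hello there"}), "text"
    )
    assert [(t.start, t.end) for t in tokens] == [(0, 5), (6, 11)]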


def test_convert_tags_to_entities(
    text: Text,
    tags: Dict[Text, List[Text]],
    confidences: Dict[Text, List[float]],
    expected_entities: List[Dict[Text, Any]],
    whitespace_tokenizer: WhitespaceTokenizerGraphComponent,
):
    extractor = EntityExtractorMixin()
    message = Message(data={TEXT: text})
    tokens = whitespace_tokenizer.tokenize(message, TEXT)

    split_entities_config = {SPLIT_ENTITIES_BY_COMMA: True}
    actual_entities = extractor.convert_predictions_into_entities(
        text, tokens, tags, split_entities_config, confidences
    )
    assert actual_entities == expected_entities
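

# `text`, `tags`, `confidences`, and `expected_entities` above are supplied by
# a parametrize decorator not shown in this section. A hypothetical case for
# illustration (values invented; the plain-string keys follow Rasa's
# "entity"/"start"/"end"/"value"/"confidence_entity" convention, assumed here):
# a single "city" tag on the last token should become one entity dict spanning
# that token.
#
#     text = "I am flying to Berlin"
#     tags = {"entity": ["O", "O", "O", "O", "city"]}
#     confidences = {"entity": [1.0, 1.0, 1.0, 1.0, 0.98]}
#     expected_entities = [
#         {
#             "entity": "city",
#             "start": 15,
#             "end": 21,
#             "value": "Berlin",
#             "confidence_entity": 0.98,
#         }
#     ]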