from typing import List

# Assumes UtteranceTokenizer, UserUtterance, AgentUtterance, SpecialStrings, and
# clean_utterance_text are imported from the surrounding package.


def build_user_utterance(
    text: str, utterance_tokenizer: UtteranceTokenizer
) -> UserUtterance:
    """Builds a UserUtterance from raw text, falling back to NULL when empty."""
    text = clean_utterance_text(text)
    if not text:
        return UserUtterance(
            original_text=SpecialStrings.NULL, tokens=[SpecialStrings.NULL]
        )
    return UserUtterance(
        original_text=text, tokens=utterance_tokenizer.tokenize(text)
    )
def build_agent_utterance(
    text: str,
    utterance_tokenizer: UtteranceTokenizer,
    described_entities: List[str],
) -> AgentUtterance:
    """Builds an AgentUtterance, carrying the entities described in the turn."""
    text = clean_utterance_text(text)
    if not text:
        return AgentUtterance(
            original_text=SpecialStrings.NULL,
            tokens=[SpecialStrings.NULL],
            described_entities=described_entities,
        )
    return AgentUtterance(
        original_text=text,
        tokens=utterance_tokenizer.tokenize(text),
        described_entities=described_entities,
    )
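# Usage sketch (illustration only, not part of the original module). It reuses
# the second example from the tokenizer test below and assumes that
# clean_utterance_text leaves this already-clean string unchanged; the
# "event_1" entity id is a made-up placeholder.
def _example_build_utterances() -> None:
    tokenizer = UtteranceTokenizer()

    user = build_user_utterance(
        "Can you also add icecream birthday tomorrow at 6PM?", tokenizer
    )
    # Tokens come from UtteranceTokenizer.tokenize applied to the cleaned text.
    assert user.tokens == [
        "Can", "you", "also", "add", "icecream", "birthday",
        "tomorrow", "at", "6", "PM", "?",
    ]

    agent = build_agent_utterance(
        "I added the event.", tokenizer, described_entities=["event_1"]
    )
    # Agent utterances additionally record which entities the turn describes.
    assert agent.described_entities == ["event_1"]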
def test_tokenize_utterance():
    utterance_tokenizer = UtteranceTokenizer()
    data = [
        (
            "Reschedule meeting with Barack Obama to 5/30/2019 at 3:00pm",
            [
                "Reschedule", "meeting", "with", "Barack", "Obama", "to",
                "5", "/", "30", "/", "2019", "at", "3", ":", "00", "pm",
            ],
        ),
        (
            "Can you also add icecream birthday tomorrow at 6PM?",
            [
                "Can", "you", "also", "add", "icecream", "birthday",
                "tomorrow", "at", "6", "PM", "?",
            ],
        ),
    ]
    for text, expected in data:
        assert utterance_tokenizer.tokenize(text) == expected
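# Companion test (a sketch, not from the original suite): exercises the
# SpecialStrings.NULL fallback in the builders above. It assumes
# clean_utterance_text maps the empty string to an empty (falsy) result.
def test_build_utterance_null_fallback():
    utterance_tokenizer = UtteranceTokenizer()

    user_utterance = build_user_utterance("", utterance_tokenizer)
    assert user_utterance.original_text == SpecialStrings.NULL
    assert user_utterance.tokens == [SpecialStrings.NULL]

    agent_utterance = build_agent_utterance(
        "", utterance_tokenizer, described_entities=[]
    )
    assert agent_utterance.original_text == SpecialStrings.NULL
    assert agent_utterance.tokens == [SpecialStrings.NULL]
    assert agent_utterance.described_entities == []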