class EntityTaggerTest(unittest.TestCase):
    """Tests for EntityTagger: literal trie matches, regex entities, and
    the tag sort/merge step."""

    def setUp(self):
        # Shared fixture: a trie with three literal entities and a tagger over it.
        self.trie = Trie()
        self.tagger = EntityTagger(self.trie, EnglishTokenizer())
        self.trie.insert("play", "PlayVerb")
        self.trie.insert("the big bang theory", "Television Show")
        self.trie.insert("the big", "Not a Thing")

    def tearDown(self):
        pass

    def test_tag(self):
        # Only pins the tag count; presumably the three trie entries
        # ("play", "the big", "the big bang theory") each match once — TODO confirm.
        tags = list(self.tagger.tag("play season 1 of the big bang theory"))
        assert len(tags) == 3

    def test_regex_tag(self):
        # A regex entity is declared via a named capture group; the group name
        # ("Event") becomes the entity type for the captured span.
        regex = re.compile(r"the (?P<Event>\w+\s\w+) theory")
        tagger = EntityTagger(self.trie, EnglishTokenizer(), regex_entities=[regex])
        tags = tagger.tag("the big bang theory")
        assert len(tags) == 3
        # Exactly one tag should cover the regex capture "big bang".
        event_tags = [tag for tag in tags if tag.get('match') == 'big bang']
        assert len(event_tags) == 1
        assert len(event_tags[0].get('entities')) == 1
        assert len(event_tags[0].get('entities')[0].get('data')) == 1
        # data holds (match, entity_type) pairs.
        assert ('big bang', 'Event') in event_tags[0].get('entities')[0].get('data')

    def test_start_end_token_match_when_sorting_tagged_entities(self):
        # Regression payload: several tags share identical start/end tokens, which
        # historically broke sorting under Python 3 (dicts are not orderable).
        repro_payload = [
            {"end_token": 1, "key": "1", "entities": [{"key": "1", "data": [["1", "Which"]], "confidence": 0.5, "match": "1"}], "start_token": 1, "match": "1"},
            {"end_token": 1, "key": "1", "entities": [{"key": "1", "data": [["1", "Which"]], "confidence": 0.5, "match": "1"}], "start_token": 1, "match": "1"},
            {"end_token": 1, "key": "1", "entities": [{"key": "1", "data": [["1", "Which"]], "confidence": 0.5, "match": "1"}], "start_token": 1, "match": "1"},
            {"end_token": 1, "key": "1", "entities": [{"key": "1", "data": [["1", "Which"]], "confidence": 0.5, "match": "1"}], "start_token": 1, "match": "1"},
            {"end_token": 3, "key": "20", "entities": [{"key": "20", "data": [["20", "SnoozeTime"]], "confidence": 0.5, "match": "20"}], "start_token": 3, "match": "20"},
            {"end_token": 4, "key": "20 minutes", "entities": [{"key": "20 minutes", "data": [["20 minutes", "SnoozeTime"]], "confidence": 0.5, "match": "20 minutes"}], "start_token": 3, "match": "20 minutes"},
            {"end_token": 3, "key": "20", "entities": [{"key": "20", "data": [["20", "Which"]], "confidence": 0.5, "match": "20"}], "start_token": 3, "match": "20"},
            {"end_token": 3, "key": "20", "entities": [{"key": "20", "data": [["20", "Which"]], "confidence": 0.5, "match": "20"}], "start_token": 3, "match": "20"},
            {"end_token": 0, "key": "snooze", "entities": [{"key": "snooze", "data": [["snooze", "SnoozeKeyword"]], "confidence": 1.0, "match": "snooze"}], "start_token": 0, "match": "snooze"},
            {"end_token": 2, "key": "for", "entities": [{"key": "for", "data": [["for", "SnoozeFiller"]], "confidence": 1.0, "match": "for"}], "start_token": 2, "match": "for"}]
        # just asserting that the sort does not crash in py3
        self.tagger._sort_and_merge_tags(repro_payload)
def test_regex_tag(self):
    """A regex entity's named capture group should appear in a tag's data."""
    pattern = re.compile(r"the (?P<Event>\w+\s\w+) theory")
    regex_tagger = EntityTagger(self.trie, EnglishTokenizer(), regex_entities=[pattern])
    tagged = regex_tagger.tag("the big bang theory")
    assert len(tagged) == 3
    # Isolate the tag produced by the regex capture span.
    matched = [t for t in tagged if t.get('match') == 'big bang']
    assert len(matched) == 1
    entities = matched[0].get('entities')
    assert len(entities) == 1
    assert len(entities[0].get('data')) == 1
    assert 'Event' in entities[0].get('data')
def setUp(self):
    """Build a fuzzy-matching trie (edit distance 2) with overlapping entities."""
    self.tokenizer = EnglishTokenizer()
    self.trie = Trie(max_edit_distance=2)
    vocabulary = (
        ("x-play", "Television Show"),
        ("play", "Play Verb"),
        ("play season", "Time Period"),
        ("play", "Player Control"),
        ("season", "Season Prefix"),
        ("1", "Number"),
        ("the big bang theory", "Television Show"),
        ("the big", "Television Show"),
        ("big bang", "event"),
        ("bang theory", "Scientific Theory"),
    )
    for phrase, entity_type in vocabulary:
        self.trie.insert(phrase, entity_type)
    self.tagger = EntityTagger(self.trie, self.tokenizer)
class EntityTaggerTest(unittest.TestCase):
    """Exercises EntityTagger over a small trie of literal entities."""

    def setUp(self):
        self.trie = Trie()
        self.tagger = EntityTagger(self.trie, EnglishTokenizer())
        for phrase, entity_type in (
                ("play", "PlayVerb"),
                ("the big bang theory", "Television Show"),
                ("the big", "Not a Thing")):
            self.trie.insert(phrase, entity_type)

    def tearDown(self):
        pass

    def test_tag(self):
        """Tagging the sample utterance yields one tag per trie entity hit."""
        tagged = list(self.tagger.tag("play season 1 of the big bang theory"))
        assert len(tagged) == 3

    def test_regex_tag(self):
        """A regex entity's named group should show up in the tag data."""
        pattern = re.compile(r"the (?P<Event>\w+\s\w+) theory")
        regex_tagger = EntityTagger(self.trie, EnglishTokenizer(),
                                    regex_entities=[pattern])
        tagged = regex_tagger.tag("the big bang theory")
        assert len(tagged) == 3
        matched = [t for t in tagged if t.get('match') == 'big bang']
        assert len(matched) == 1
        entities = matched[0].get('entities')
        assert len(entities) == 1
        assert len(entities[0].get('data')) == 1
        assert 'Event' in entities[0].get('data')
def __init__(self, tokenizer=None, trie=None):
    """Wire up the engine's shared tokenizer, trie, regex registries and tagger."""
    pyee.EventEmitter.__init__(self)
    active_tokenizer = tokenizer or EnglishTokenizer()
    active_trie = trie or Trie()
    self.tokenizer = active_tokenizer
    self.trie = active_trie
    # Regex entities and their raw pattern strings are shared with the tagger.
    self.regular_expressions_entities = []
    self._regex_strings = set()
    self.tagger = EntityTagger(active_trie, active_tokenizer,
                               self.regular_expressions_entities)
    self.intent_parsers = []
def setUp(self):
    """Parser fixture: trie values are (match, entity_type) tuples."""
    self.trie = Trie()
    self.tokenizer = EnglishTokenizer()
    self.regex_entities = []
    self.tagger = EntityTagger(self.trie, self.tokenizer,
                               regex_entities=self.regex_entities)
    for phrase, entity_type in (
            ("play", "PlayVerb"),
            ("the big bang theory", "Television Show"),
            ("the big", "Not a Thing"),
            ("barenaked ladies", "Radio Station")):
        self.trie.insert(phrase, (phrase, entity_type))
    self.parser = Parser(self.tokenizer, self.tagger)
def __init__(self, tokenizer=None, trie=None):
    """
    Initialize the IntentDeterminationEngine

    Args:
        tokenizer(tokenizer): tokenizer used to break up spoken text,
            example EnglishTokenizer()
        trie(Trie): tree of matches to Entites
    """
    pyee.EventEmitter.__init__(self)
    self.tokenizer = EnglishTokenizer() if not tokenizer else tokenizer
    self.trie = Trie() if not trie else trie
    # The tagger shares this list, so regexes registered later are picked up.
    self.regular_expressions_entities = []
    self._regex_strings = set()
    self.tagger = EntityTagger(self.trie, self.tokenizer,
                               self.regular_expressions_entities)
    self.intent_parsers = []
def test_intent_with_regex_entity(self):
    """A regex capture group should satisfy a require()'d intent entity."""
    # Rebuild the fixture so only this test's vocabulary is present.
    self.trie = Trie()
    self.tagger = EntityTagger(self.trie, self.tokenizer, self.regex_entities)
    self.parser = Parser(self.tokenizer, self.tagger)
    self.trie.insert("theory", ("theory", "Concept"))
    self.regex_entities.append(re.compile(r"the (?P<Event>.*)"))
    intent = IntentBuilder("mock intent").require("Event").require("Concept").build()
    for result in self.parser.parse("the big bang theory"):
        determined = intent.validate(result.get('tags'), result.get('confidence'))
        assert determined.get('confidence') > 0.0
        assert determined.get('Event') == 'big bang'
        assert determined.get('Concept') == "theory"
def __init__(self):
    """Build a Parser fixture over a trie of (match, entity_type) tuples."""
    self.trie = Trie()
    self.tokenizer = EnglishTokenizer()
    self.regex_entities = []
    self.tagger = EntityTagger(self.trie, self.tokenizer,
                               regex_entities=self.regex_entities)
    # Several phrases deliberately carry more than one entity type.
    vocabulary = (
        ("play", "PlayVerb"),
        ("play", "Command"),
        ("the big bang theory", "Television Show"),
        ("all that", "Television Show"),
        ("all that", "Radio Station"),
        ("the big", "Not a Thing"),
        ("barenaked ladies", "Radio Station"),
        ("show", "Command"),
        ("what", "Question"),
    )
    for phrase, entity_type in vocabulary:
        self.trie.insert(phrase, (phrase, entity_type))
    self.parser = Parser(self.tokenizer, self.tagger)
    self.intent = IntentBuilder("Test Intent").require(
        "PlayVerb").one_of("Television Show", "Radio Station").build()
class BronKerboschExpanderTest(unittest.TestCase):
    """Tests for BronKerboschExpander: expanding tagged entities into
    non-overlapping parse cliques."""

    def setUp(self):
        # Fuzzy trie (edit distance 2) with deliberately overlapping entities.
        self.tokenizer = EnglishTokenizer()
        self.trie = Trie(max_edit_distance=2)
        self.trie.insert("x-play", "Television Show")
        self.trie.insert("play", "Play Verb")
        self.trie.insert("play season", "Time Period")
        self.trie.insert("play", "Player Control")
        self.trie.insert("season", "Season Prefix")
        self.trie.insert("1", "Number")
        self.trie.insert("the big bang theory", "Television Show")
        self.trie.insert("the big", "Television Show")
        self.trie.insert("big bang", "event")
        self.trie.insert("bang theory", "Scientific Theory")
        self.tagger = EntityTagger(self.trie, self.tokenizer)

    def testExpander(self):
        # Disable fuzzy matching so only exact entity hits are tagged.
        self.tagger.trie.max_edit_distance = 0
        tags = self.tagger.tag("play season 1 of the big bang theory")
        expander = BronKerboschExpander(self.tokenizer)
        parse_results = list(expander.expand(tags))
        assert len(parse_results) == 6

    def testExpandedResult(self):
        tags = self.tagger.tag("season 1")
        expander = BronKerboschExpander(self.tokenizer)
        parse_results = list(expander.expand(tags))
        # "season 1" has no overlaps: one clique containing both tags.
        assert len(parse_results) == 1
        assert len(parse_results[0]) == 2

    def testConsistentExpandWithSameOverlapMultipleTimes(self):
        """
        example: play season 1 of the big bang theory
                 play season one of the big bang theory

        series should contain two instances of the big bang theory
        :return:
        """
        utterance = "play season 1 of the big bang theory"
        tags = self.tagger.tag(utterance)

        def score_clique(clique):
            # Sum of each tag's first-entity confidence, weighted by match
            # length and normalized by utterance length (+1 avoids div by 0).
            score = 0.0
            for tagged_entity in clique:
                ec = tagged_entity.get('entities', [{
                    'confidence': 0.0
                }])[0].get('confidence')
                score += ec * len(
                    tagged_entity.get('entities', [{
                        'match': ''
                    }])[0].get('match')) / (len(utterance) + 1)
            return score

        expander = BronKerboschExpander(self.tokenizer)
        parse_results = list(
            expander.expand(tags, clique_scoring_func=score_clique))
        assert len(parse_results) == 6
        result_text = ' '.join(
            [tag.get('entities')[0].get('key') for tag in parse_results[0]])
        # NOTE(review): result_parse is computed but never asserted on.
        result_parse = ', '.join([
            tag.get('entities')[0].get('data')[0][1]
            for tag in parse_results[0]
        ])
        assert result_text == 'play season 1 the big bang theory'

    def testExpandWithRegexAndLiteralTokenMatch(self):
        # two tags for the same token, different confidence, should expand to
        # two cliques
        tags = [{
            'end_token': 0,
            'start_token': 0,
            'key': u'spell',
            'match': u'spell',
            'entities': [{
                'confidence': 0.5,
                'data': [u'SearchTerms'],
                'match': u'spell',
                'key': u'spell'
            }]
        }, {
            'end_token': 0,
            'start_token': 0,
            'key': u'spell',
            'match': u'spell',
            'entities': [{
                'confidence': 1.0,
                'data': [u'SpellingKeyword'],
                'match': u'spell',
                'key': u'spell'
            }]
        }]
        expander = BronKerboschExpander(self.tokenizer)
        cliques = list(expander._sub_expand(tags))
        assert len(cliques) == 2
class AdaptTTIPlugin(plugin.TTIPlugin):
    """Text-to-intent plugin backed by the Adapt intent engine.

    NOTE(review): these components are class attributes, so every instance of
    this plugin shares one engine/trie — confirm that is intentional.
    """
    tokenizer = EnglishTokenizer()
    trie = Trie()
    tagger = EntityTagger(trie, tokenizer)
    parser = Parser(tokenizer, tagger)
    engine = IntentDeterminationEngine()

    def add_word(self, intent, word):
        """Register a word (or keyword collection) with the Adapt engine.

        Returns the generated keyword name used for intent construction.
        """
        # Check if this is a collection
        if is_keyword(word):
            # Strip the surrounding markers (e.g. "{City}" -> "City"); names
            # are prefixed with the intent to avoid cross-intent collisions.
            keyword_name = "{}_{}".format(intent, word[1:][:-1])
            # print("Registering words for '{}'".format(keyword_name))
            # This doesn't have to exist:
            if keyword_name in self.keywords:
                for keyword_word in self.keywords[keyword_name]['words']:
                    # print("Registering '{}'".format(keyword_word))
                    self.engine.register_entity(keyword_word, keyword_name)
            if keyword_name in self.regex:
                for regex in self.regex[keyword_name]:
                    self.engine.register_regex_entity(regex)
        else:
            # Just register the word as a required word
            self.keyword_index += 1
            keyword_name = "{}_{}".format(intent,
                                          makeindex(self.keyword_index))
            # print("Registering word '{}' as {}".format(word, keyword_name))
            self.engine.register_entity(word, keyword_name)
        return keyword_name

    def add_intents(self, intents):
        """Load intent definitions: record keywords, regexes, templates, and
        per-word frequency statistics used later by train().

        NOTE(review): intents without a 'locale' key will raise KeyError at
        the template lookup below — confirm callers always supply one.
        """
        for intent in intents:
            # print("Adding intent {}".format(intent))
            # this prevents collisions between intents
            intent_base = intent
            intent_inc = 0
            locale = profile.get("language")
            while intent in self.intent_map['intents']:
                intent_inc += 1
                intent = "{}{}".format(intent_base, intent_inc)
            if ('locale' in intents[intent_base]):
                # If the selected locale is not available, try matching just
                # the language ("en-US" -> "en")
                if (locale not in intents[intent_base]['locale']):
                    for language in intents[intent_base]['locale']:
                        if (language[:2] == locale[:2]):
                            locale = language
                            break
            # NOTE(review): duplicate of the collision loop above — intent has
            # not changed since, so this second pass appears redundant.
            while intent in self.intent_map['intents']:
                intent_inc += 1
                intent = "{}{}".format(intent_base, intent_inc)
            if ('keywords' in intents[intent_base]['locale'][locale]):
                for keyword in intents[intent_base]['locale'][locale][
                        'keywords']:
                    keyword_token = "{}_{}".format(intent, keyword)
                    self.keywords[keyword_token] = {
                        'words':
                        intents[intent_base]['locale'][locale]['keywords']
                        [keyword],
                        'name':
                        keyword
                    }
            if ('regex' in intents[intent_base]['locale'][locale]):
                for regex_name in intents[intent_base]['locale'][locale][
                        'regex']:
                    regex_token = "{}_{}".format(intent, regex_name)
                    self.regex[regex_token] = []
                    # Rewrite the named group to the collision-safe token.
                    for regex in intents[intent_base]['locale'][locale][
                            'regex'][regex_name]:
                        self.regex[regex_token].append(
                            regex.replace(regex_name, regex_token))
                # pprint(self.regex)
            self.intent_map['intents'][intent] = {
                'action': intents[intent_base]['action'],
                'name': intent_base,
                'templates': [],
                'words': {}
            }
            for phrase in intents[intent_base]['locale'][locale]['templates']:
                # Save the phrase so we can search for undefined keywords
                self.intent_map['intents'][intent]['templates'].append(phrase)
                # Make a count of word frequency. The fact that small connector
                # type words sometimes appear multiple times in a single
                # sentence while the focal words usually only appear once is
                # giving too much weight to those connector words.
                words = list(set(phrase.split()))
                for word in words:
                    # Non-keyword words are normalized to upper case.
                    if not is_keyword(word):
                        word = word.upper()
                    # Count the number of times the word appears in this intent
                    try:
                        self.intent_map['intents'][intent]['words'][word][
                            'count'] += 1
                    except KeyError:
                        self.intent_map['intents'][intent]['words'][word] = {
                            'count': 1,
                            'weight': None,
                            'required': False
                        }
                    # Count the number of intents the word appears in
                    try:
                        self.words[word].update({intent: True})
                    except KeyError:
                        self.words[word] = {intent: True}
            # for each word in each intent, divide the word frequency by the
            # number of examples. Since a word is only counted once per
            # example, regardless of how many times it appears, if the number
            # of times it was counted matches the number of examples, then
            # this is a "required" word.
            phrase_count = len(
                intents[intent_base]['locale'][locale]['templates'])
            for word in self.intent_map['intents'][intent]['words']:
                # print("Word: '{}' Count: {} Phrases: {} Weight: {}".format(
                #     word, ..., phrase_count, ...))
                Weight = weight(
                    self.intent_map['intents'][intent]['words'][word]['count'],
                    phrase_count)
                self.intent_map['intents'][intent]['words'][word][
                    'weight'] = Weight
                # Weight == 1 means the word appeared in every template.
                if Weight == 1:
                    self.intent_map['intents'][intent]['words'][word][
                        'required'] = True

    # Call train after loading all the intents.
    def train(self):
        """Down-weight words shared across intents, then register one Adapt
        intent parser per loaded intent (required + optional keywords)."""
        # print("Words:")
        # pprint(self.words)
        for intent in self.intent_map['intents']:
            required_words = []
            optional_words = []
            # print("Training {}".format(intent))
            for word in self.intent_map['intents'][intent]['words']:
                intents_count = len(self.intent_map['intents'])
                word_appears_in = len(self.words[word])
                # Scale down words that appear in many intents: a word unique
                # to this intent keeps its weight, a ubiquitous word goes to 0.
                self.intent_map['intents'][intent]['words'][word][
                    'weight'] = self.intent_map['intents'][intent]['words'][
                        word]['weight'] * (
                            intents_count - word_appears_in) / intents_count
                if (self.intent_map['intents'][intent]['words'][word]
                        ['required']):
                    # add the word as required.
                    # print("adding '{}' as required".format(word_token))
                    required_words.append(self.add_word(intent, word))
                else:
                    # if the word is a keyword list, add it
                    if (word[:1] + word[-1:] == "{}"):
                        optional_words.append(self.add_word(intent, word))
                    else:
                        # 0.35 is an empirical relevance threshold — words
                        # weighted below it are dropped entirely.
                        if (self.intent_map['intents'][intent]['words'][word]
                                ['weight'] > 0.35):
                            # print("adding '{}' as optional".format(word_token))
                            optional_words.append(self.add_word(intent, word))
            construction = IntentBuilder(intent)
            for keyword in required_words:
                # print("Required word: {}".format(keyword))
                construction = construction.require(keyword)
            for keyword in optional_words:
                # print("Optional word: {}".format(keyword))
                construction = construction.optionally(keyword)
            # NOTE(review): an IntentBuilder instance is always truthy, so
            # this guard never skips registration.
            if (construction):
                # print("Building {}".format(intent))
                self.engine.register_intent_parser(construction.build())
        # pprint(self.intent_map['intents'])
        self.trained = True

    def get_plugin_phrases(self, passive_listen=False):
        """Collect every phrase the STT layer should be able to recognize:
        wake keyword(s), custom standard phrases, and all intent templates
        with keyword placeholders expanded."""
        phrases = []
        # include the keyword, otherwise
        if (passive_listen):
            keywords = profile.get(["keyword"])
            if not (isinstance(keywords, list)):
                keywords = [keywords]
            phrases.extend([word.upper() for word in keywords])
        # Include any custom phrases (things you say to Naomi
        # that don't match plugin phrases. Otherwise, there is
        # a high probability that something you say will be
        # interpreted as a command. For instance, the
        # "check_email" plugin has only "EMAIL" and "INBOX" as
        # standard phrases, so every time I would say
        # "Naomi, check email" Naomi would hear "NAOMI SHUT EMAIL"
        # and shut down.
        custom_standard_phrases_file = paths.data(
            "standard_phrases",
            "{}.txt".format(profile.get(['language'], 'en-US')))
        if (os.path.isfile(custom_standard_phrases_file)):
            with open(custom_standard_phrases_file, mode='r') as f:
                for line in f:
                    phrase = line.strip()
                    if phrase:
                        phrases.append(phrase)
        # for plugin in self._plugins:
        for intent in self.intent_map['intents']:
            if ('templates' in self.intent_map['intents'][intent]):
                templates = self.intent_map['intents'][intent]['templates']
                keywords_list = [keyword for keyword in self.keywords]
                # print("Keywords: {}".format(keywords_list))
                for keyword in keywords_list:
                    # This will not replace keywords that do not have a list
                    # associated with them, like regex and open keywords
                    # print("Replacing {} with words from {} in templates"
                    #       .format(keyword, keywords[keyword]))
                    if (keyword[:len(intent) + 1] == "{}_".format(intent)):
                        short_keyword = self.keywords[keyword]['name']
                        # NOTE(review): templates is extended while being
                        # iterated; expanded copies no longer contain the
                        # placeholder, so expansion terminates, but confirm.
                        for template in templates:
                            # print("Checking template: {} for keyword {}"
                            #       .format(template, short_keyword))
                            if (to_keyword(short_keyword) in template):
                                templates.extend([
                                    template.replace(
                                        to_keyword(short_keyword),
                                        word.upper())
                                    for word in self.keywords[keyword]['words']
                                ])
                        # Now that we have expanded every instance of keyword
                        # in templates, delete any template that still
                        # contains keyword
                        templates = [
                            template for template in templates
                            if not to_keyword(short_keyword) in template
                        ]
                phrases.extend(templates)
        return sorted(phrases)

    def determine_intent(self, phrase):
        """Run the Adapt engine over *phrase* and map each confident result to
        {intent name: {action, input, matches, score}}."""
        response = {}
        try:
            for intent in self.engine.determine_intent(phrase):
                if intent and intent.get("confidence") > 0:
                    keywords = {}
                    for keyword in intent:
                        # Skip Adapt's bookkeeping keys; everything else is a
                        # matched entity.
                        if keyword not in [
                                'confidence', 'intent_type', 'target'
                        ]:
                            if keyword in self.keywords:
                                # Since the Naomi parser can return a list of
                                # matching words, this needs to be a list
                                keywords[self.keywords[keyword]['name']] = [
                                    intent[keyword]
                                ]
                    response.update({
                        self.intent_map['intents'][intent['intent_type']]['name']: {
                            'action':
                            self.intent_map['intents'][intent['intent_type']]
                            ['action'],
                            'input':
                            phrase,
                            'matches':
                            keywords,
                            'score':
                            intent['confidence']
                        }
                    })
        # NOTE(review): only ZeroDivisionError is handled — presumably raised
        # inside Adapt on empty input; other engine errors propagate.
        except ZeroDivisionError:
            print("Could not determine an intent")
        return response
class BronKerboschExpanderTest(unittest.TestCase):
    """Tests for BronKerboschExpander clique expansion of tagged entities."""

    def setUp(self):
        # Fuzzy trie (edit distance 2) with intentionally overlapping entries.
        self.tokenizer = EnglishTokenizer()
        self.trie = Trie(max_edit_distance=2)
        self.trie.insert("x-play", "Television Show")
        self.trie.insert("play", "Play Verb")
        self.trie.insert("play season", "Time Period")
        self.trie.insert("play", "Player Control")
        self.trie.insert("season", "Season Prefix")
        self.trie.insert("1", "Number")
        self.trie.insert("the big bang theory", "Television Show")
        self.trie.insert("the big", "Television Show")
        self.trie.insert("big bang", "event")
        self.trie.insert("bang theory", "Scientific Theory")
        self.tagger = EntityTagger(self.trie, self.tokenizer)

    def testExpander(self):
        # Exact matches only for this test.
        self.tagger.trie.max_edit_distance = 0
        tags = self.tagger.tag("play season 1 of the big bang theory")
        expander = BronKerboschExpander(self.tokenizer)
        parse_results = list(expander.expand(tags))
        assert len(parse_results) == 6

    def testExpandedResult(self):
        tags = self.tagger.tag("season 1")
        expander = BronKerboschExpander(self.tokenizer)
        parse_results = list(expander.expand(tags))
        # No overlapping tags: a single clique holding both.
        assert len(parse_results) == 1
        assert len(parse_results[0]) == 2

    def testConsistentExpandWithSameOverlapMultipleTimes(self):
        """
        example: play season 1 of the big bang theory
                 play season one of the big bang theory

        series should contain two instances of the big bang theory
        :return:
        """
        utterance = "play season 1 of the big bang theory"
        tags = self.tagger.tag(utterance)

        def score_clique(clique):
            # Confidence-weighted-by-match-length score, normalized by
            # utterance length (+1 guards against division by zero).
            score = 0.0
            for tagged_entity in clique:
                ec = tagged_entity.get('entities',
                                       [{'confidence': 0.0}])[0].get('confidence')
                score += ec * len(tagged_entity.get('entities',
                                                    [{'match': ''}])[0].get('match')) / (
                    len(utterance) + 1)
            return score

        expander = BronKerboschExpander(self.tokenizer)
        parse_results = list(expander.expand(tags, clique_scoring_func=score_clique))
        assert len(parse_results) == 6
        result_text = ' '.join([tag.get('entities')[0].get('key') for tag in parse_results[0]])
        # NOTE(review): result_parse is built but never asserted on.
        result_parse = ', '.join(
            [tag.get('entities')[0].get('data')[0][1]
             for tag in parse_results[0]]
        )
        assert result_text == 'play season 1 the big bang theory'

    def testExpandWithRegexAndLiteralTokenMatch(self):
        # two tags for the same token, different confidence, should expand to
        # two cliques
        tags = [{'end_token': 0,
                 'start_token': 0,
                 'key': u'spell',
                 'match': u'spell',
                 'entities': [{'confidence': 0.5,
                               'data': [u'SearchTerms'],
                               'match': u'spell',
                               'key': u'spell'}]},
                {'end_token': 0,
                 'start_token': 0,
                 'key': u'spell',
                 'match': u'spell',
                 'entities': [{'confidence': 1.0,
                               'data': [u'SpellingKeyword'],
                               'match': u'spell',
                               'key': u'spell'}]}]
        expander = BronKerboschExpander(self.tokenizer)
        cliques = list(expander._sub_expand(tags))
        assert len(cliques) == 2
def setUp(self):
    """Build the shared trie/tagger fixture with three sample entities."""
    self.trie = Trie()
    self.tagger = EntityTagger(self.trie, EnglishTokenizer())
    for phrase, entity_type in (
            ("play", "PlayVerb"),
            ("the big bang theory", "Television Show"),
            ("the big", "Not a Thing")):
        self.trie.insert(phrase, entity_type)
def tagger(self):
    """Create an EntityTagger wired to this engine's trie, tokenizer and
    regex entities."""
    entity_tagger = EntityTagger(self.trie, self.tokenizer,
                                 self.regular_expressions_entities)
    return entity_tagger
PYTHONPATH=. python examples/multi_intent_parser.py "what's the weather like in tokyo"
PYTHONPATH=. python examples/multi_intent_parser.py "play some music by the clash"
"""
import json
import sys

from adapt.entity_tagger import EntityTagger
from adapt.tools.text.tokenizer import EnglishTokenizer
from adapt.tools.text.trie import Trie
from adapt.intent import IntentBuilder
from adapt.parser import Parser
from adapt.engine import DomainIntentDeterminationEngine

# Shared NLP components: one trie/tagger/parser trio feeds the engine.
tokenizer = EnglishTokenizer()
trie = Trie()
tagger = EntityTagger(trie, tokenizer)
parser = Parser(tokenizer, tagger)

# Domain engine: entities and intents are registered per named domain.
engine = DomainIntentDeterminationEngine()
engine.register_domain('Domain1')
engine.register_domain('Domain2')

# define vocabulary
weather_keyword = ["weather"]

for wk in weather_keyword:
    engine.register_entity(wk, "WeatherKeyword", domain='Domain1')

weather_types = ["snow", "rain", "wind", "sleet", "sun"]
from adapt.tools.text.tokenizer import EnglishTokenizer
from adapt.entity_tagger import EntityTagger
from adapt.tools.text.trie import Trie
# Fix: BronKerboschExpander was used below but never imported (NameError).
from adapt.expander import BronKerboschExpander
import pprint

# Build a trie of overlapping entities to exercise clique expansion.
tokenizer = EnglishTokenizer()
trie = Trie()
trie.insert("x-play", "Television Show")
trie.insert("play", "Play Verb")
trie.insert("play season", "Time Period")
trie.insert("play", "Player Control")
trie.insert("season", "Season Prefix")
trie.insert("1", "Number")
trie.insert("the big bang theory", "Television Show")
trie.insert("the big", "Television Show")
trie.insert("big bang", "event")
trie.insert("bang theory", "Scientific Theory")

tagger = EntityTagger(trie, tokenizer)
tags = tagger.tag("play season 2 the big 1 of the big bang theory")

# Show the tag-compatibility graph: each vertex is a tag, neighbors are
# tags that can coexist in one parse.
expander = BronKerboschExpander(tokenizer)
tag_graph = expander._build_graph(tags)
for vertex in tag_graph.vertex_set():
    print("vertex", vertex, list(tag_graph.get_neighbors_of(vertex)))

# Expand the tags into maximal cliques (candidate parses) and dump each one.
parse_results = list(expander.expand(tags))
for clique in parse_results:
    print("Bke ----- ")
    for tagged in clique:
        pprint.pprint(tagged)