def __init__(self, agent, kb, uuid):
    '''
    Dialogue data that is needed by the model.
    '''
    self.uuid = uuid
    self.agent = agent
    self.kb = kb
    self.role = kb.role
    partner_role = 'buyer' if self.role == 'seller' else 'seller'
    self.agent_to_role = {self.agent: self.role, 1 - self.agent: partner_role}
    # KB context
    # TODO: context_to_int will change category, title, description to integers
    self.category_str = kb.category
    self.category = kb.category
    self.title = tokenize(re.sub(r'[^\w0-9]', ' ', kb.facts['item']['Title']))
    self.description = tokenize(re.sub(r'[^\w0-9]', ' ', ' '.join(kb.facts['item']['Description'])))
    # token_turns: tokens and entities (output of entity linking)
    self.token_turns = []
    # turns: input tokens of the encoder, decoder input and target; later converted to integers
    self.turns = [[], [], []]
    # entities: same structure as turns; non-entity tokens are None
    self.entities = []
    self.agents = []
    self.roles = []
    self.is_int = False  # Whether we've converted the tokens to integers
    self.token_candidates = None
    self.candidates = None
    self.true_candidate_inds = None

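# Every snippet in this file assumes a shared tokenize() helper. Its exact
# behavior is project-specific (e.g. an NLTK-style word tokenizer in the cocoa
# snippets, custom regex rules in the nmt-chatbot ones), so the following is
# only a minimal sketch of the assumed interface, not the real implementation.
import re

def tokenize_sketch(utterance, lowercase=True):
    """Hypothetical stand-in for tokenize(): lowercase by default, then split
    into word and punctuation tokens."""
    if lowercase:
        utterance = utterance.lower()
    return re.findall(r"\w+|[^\w\s]", utterance)

# tokenize_sketch("I'll pay $50!") -> ['i', "'", 'll', 'pay', '$', '50', '!']
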
def process_event(self, e, kb, mentioned_entities=None, known_kb=True):
    '''
    Convert an event to two lists of tokens and entities for encoding and decoding.
    '''
    if e.action == 'message':
        # Lowercase, tokenize, link entities
        entity_tokens = self.lexicon.link_entity(
            tokenize(e.data), kb=kb,
            mentioned_entities=mentioned_entities, known_kb=known_kb)
        entity_tokens = [normalize_number(x) if not is_entity(x) else x
                         for x in entity_tokens]
        if entity_tokens:
            # NOTE: keep two copies because we might change them for decoding/encoding
            return (entity_tokens, copy.copy(entity_tokens))
        else:
            return None
    elif e.action == 'select':
        # Convert an item to an item id (relative to the speaker)
        item_id = self.get_item_id(kb, e.data)
        # Represent the item by its entities during encoding and by its item id during decoding
        return ([markers.SELECT] + self.item_to_entities(e.data, kb.attributes),
                [markers.SELECT, item_to_entity(item_id)])
    else:
        raise ValueError('Unknown event action.')

def test(self, c, d, raw_utterance, lexicon):
    scenario = {'book': c[0], 'hat': c[1], 'ball': c[2]}
    proposal, _, _ = self.parse_proposal(
        lexicon.link_entity(tokenize(raw_utterance)), scenario)
    if not proposal:
        print('No offer detected: {}'.format(raw_utterance))
        return False
    passed = True
    for i, item in enumerate(('book', 'hat', 'ball')):
        if proposal[self.ME][item] != d[i]:
            passed = False
            break
    if passed:
        print("Passed")
    else:
        print("TEST SCENARIO")
        print(" There are {0} books, {1} hats, and {2} balls.".format(c[0], c[1], c[2]))
        print(" Sentence: {0}".format(raw_utterance))
        print("SYSTEM OUTPUT")
        print('For me:')
        print(proposal[self.ME])
        print('For you:')
        print(proposal[self.YOU])
        print(" The correct proposal is {0} books, {1} hats, and {2} balls".format(d[0], d[1], d[2]))
        print("------------------------------")
    return passed

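# Hedged usage sketch for the harness above: `c` is the scenario's item counts
# and `d` is the expected split for the speaker ("me"). `parser` and `lexicon`
# are assumed to be an initialized proposal parser and entity lexicon; the
# names and the utterance are hypothetical.
# passed = parser.test(c=(2, 3, 1), d=(1, 0, 1),
#                      raw_utterance='i want one book and one ball',
#                      lexicon=lexicon)
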
def extract_keywords(self):
    if self.action == 'message':
        # Re-tokenize here because POS tagging is case-sensitive
        tags = pos_tag(tokenize(self.text, lowercase=False))
        # Keep nouns and adjectives as keywords
        self.keywords = [word for word, tag in tags if re.match(r'NN*|ADJ*', tag)]

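# Assuming pos_tag is NLTK's tagger (an assumption; the snippet does not pin
# down the tagset), this is roughly what the keyword filter sees:
# from nltk import pos_tag
# pos_tag(['The', 'red', 'bike', 'rides', 'well'])
# -> roughly [('The', 'DT'), ('red', 'JJ'), ('bike', 'NN'), ('rides', 'VBZ'), ('well', 'RB')]
# Caveat: with Penn Treebank tags, r'NN*' matches NN/NNS/NNP, but r'ADJ*' never
# matches JJ; the adjective pattern only fires under the universal tagset's ADJ.
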
def process_event(self, e, agent, sel=None):
    '''
    "sel" is short for selection: a list of two agents, where each agent
    is a dict with three keys: book, hat, ball. (The outcome is handled
    separately with its own loss function.)
    '''
    if e.action == 'message':
        # Lowercase, tokenize, link entities
        entity_tokens = self.lexicon.link_entity(tokenize(e.data))
        return entity_tokens if entity_tokens else None
    elif e.action == 'select':
        entity_tokens = [markers.SELECT]
        if e.agent == agent:
            entity_tokens.extend(['{count}'.format(count=sel[agent][item])
                                  for item in self.lexicon.items])
        return entity_tokens
    elif e.action == 'quit':
        return [markers.QUIT]
    else:
        raise ValueError('Unknown event action.')

def parse_message(self, event, dialogue_state):
    tokens = self.lexicon.link_entity(tokenize(event.data))
    utterance = Utterance(raw_text=event.data, tokens=tokens)
    intent = self.classify_intent(utterance)
    split = None
    proposal_type = None
    ambiguous_proposal = False
    if intent == 'propose':
        proposal, proposal_type, ambiguous_proposal = self.parse_proposal(
            utterance.tokens, self.kb.item_counts)
        if proposal:
            # NOTE: YOU/ME in the proposal is from the partner's perspective
            split = {self.agent: proposal[self.YOU],
                     self.partner: proposal[self.ME]}
            if (dialogue_state.partner_proposal and
                    split[self.partner] == dialogue_state.partner_proposal[self.partner]):
                intent = 'insist'
    lf = LF(intent, proposal=split, proposal_type=proposal_type)
    utterance.lf = lf
    utterance.template = self.extract_template(tokens, dialogue_state)
    utterance.ambiguous_template = ambiguous_proposal
    return utterance

def process_questions(questions, return_score_modifiers=False):

    # Make a list
    if not isinstance(questions, list):
        questions = [questions]

    # Clean and tokenize
    prepared_questions = []
    for question in questions:
        question = question.strip()
        prepared_questions.append(apply_bpe(tokenize(question)) if question else '##emptyquestion##')

    # Run inference
    answers_list = inference_helper(prepared_questions)

    # Process answers
    prepared_answers_list = []
    for index, answers in enumerate(answers_list):
        answers = detokenize(answers)
        answers = replace_in_answers(answers)
        answers = normalize_new_lines(answers)
        answers_score = score_answers(questions[index], answers)
        best_index, best_score = get_best_score(answers_score['score'])

        if prepared_questions[index] == '##emptyquestion##':
            prepared_answers_list.append(None)
        elif return_score_modifiers:
            prepared_answers_list.append({'answers': answers, 'scores': answers_score['score'],
                                          'best_index': best_index, 'best_score': best_score,
                                          'score_modifiers': answers_score['score_modifiers']})
        else:
            prepared_answers_list.append({'answers': answers, 'scores': answers_score['score'],
                                          'best_index': best_index, 'best_score': best_score})

    return prepared_answers_list

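# Usage sketch for the function above. The '##emptyquestion##' sentinel keeps
# question and answer indices aligned when an input is blank:
# results = process_questions('Hello, how are you?')
# results[0]['answers'][results[0]['best_index']]   # best-scoring answer text
# process_questions(['hi', ''])                     # -> [dict, None]
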
def process_questions(questions, include_blacklisted=True):
    # A variant of process_questions above: no BPE step, and scoring uses a
    # blacklist flag instead of score modifiers.

    # Make a list
    if not isinstance(questions, list):
        questions = [questions]

    # Clean and tokenize
    prepared_questions = []
    for question in questions:
        question = question.strip()
        prepared_questions.append(tokenize(question) if question else '##emptyquestion##')

    # Run inference
    answers_list = inference_helper(prepared_questions)

    # Process answers
    prepared_answers_list = []
    for index, answers in enumerate(answers_list):
        answers = detokenize(answers)
        answers = replace_in_answers(answers, 'answers')
        answers_score = score_answers(answers, 'answers')
        best_index, best_score = get_best_score(answers_score, include_blacklisted)

        if prepared_questions[index] == '##emptyquestion##':
            prepared_answers_list.append(None)
        else:
            prepared_answers_list.append({'answers': answers, 'scores': answers_score,
                                          'best_index': best_index, 'best_score': best_score})

    return prepared_answers_list

def process_event(self, e, kb):
    '''
    Tokenize, link entities.
    '''
    if e.action == 'message':
        # Lowercase, tokenize, link entities
        entity_tokens = self.lexicon.link_entity(tokenize(e.data), kb=kb,
                                                 scale=True, price_clip=4.)
        return entity_tokens if entity_tokens else None
    elif e.action == 'offer':
        data = e.data['price']
        if data is None:
            return None
        price = PriceScaler._scale_price(kb, data)
        return [markers.OFFER, self.price_to_entity(price)]
    elif e.action == 'quit':
        return [markers.QUIT]
    elif e.action == 'accept':
        return [markers.ACCEPT]
    elif e.action == 'reject':
        return [markers.REJECT]
    else:
        raise ValueError('Unknown event action.')

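# Sketch of the sequences this produces (marker names come from the project's
# `markers` namespace; the price-entity shape is assumed for illustration):
# message 'i can do 50'   -> ['i', 'can', 'do', <linked, scaled price entity>]
# offer {'price': 50}     -> [markers.OFFER, <scaled price entity>]
# accept / reject / quit  -> [markers.ACCEPT] / [markers.REJECT] / [markers.QUIT]
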
def get_total_tokens_per_agent(transcript):
    tokens = {0: 0., 1: 0.}
    for event in transcript["events"]:
        if event["action"] == "message":
            msg_tokens = tokenize(event["data"])
            tokens[event["agent"]] += len(msg_tokens)
    return tokens

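# Minimal usage sketch with a toy transcript dict (shape inferred from the code
# above: a list of events, each with "action", "agent", and "data"):
# transcript = {"events": [
#     {"action": "message", "agent": 0, "data": "hi, is the bike available?"},
#     {"action": "message", "agent": 1, "data": "yes it is"},
# ]}
# get_total_tokens_per_agent(transcript)
# -> {0: 7.0, 1: 3.0} with the sketch tokenizer above
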
def count_words(cls, examples):
    counts = defaultdict(int)
    for ex in examples:
        for event in ex.events:
            if event.action == 'message':
                tokens = tokenize(event.data)
                for token in tokens:
                    counts[token] += 1
    return counts

def get_price_trend(cls, price_tracker, chat, agent=None):

    def _normalize_price(seen_price):
        return (float(seller_target) - float(seen_price)) / (float(seller_target) - float(buyer_target))

    scenario = NegotiationScenario.from_dict(None, chat['scenario'])
    kbs = scenario.kbs
    roles = {kbs[0].facts['personal']['Role']: 0,
             kbs[1].facts['personal']['Role']: 1}
    buyer_target = kbs[roles[utils.BUYER]].facts['personal']['Target']
    seller_target = kbs[roles[utils.SELLER]].facts['personal']['Target']
    prices = []
    for e in chat['events']:
        if e['action'] == 'message':
            if agent is not None and e['agent'] != agent:
                continue
            raw_tokens = tokenize(e['data'])
            # Link entities
            linked_tokens = price_tracker.link_entity(raw_tokens, kb=kbs[e['agent']])
            for token in linked_tokens:
                if isinstance(token, Entity):
                    try:
                        replaced = PriceScaler.unscale_price(kbs[e['agent']], token)
                    except OverflowError:
                        print("Raw tokens:", raw_tokens)
                        print("Overflow error: {:s}".format(token))
                        print(kbs[e['agent']].facts)
                        print("-------")
                        continue
                    norm_price = _normalize_price(replaced.canonical.value)
                    # If the number is greater than the list price or significantly
                    # lower than the buyer's target, it's probably not a price
                    if 0. <= norm_price <= 2.:
                        prices.append(norm_price)
        elif e['action'] == 'offer':
            norm_price = _normalize_price(e['data']['price'])
            if 0. <= norm_price <= 2.:
                prices.append(norm_price)
    return prices

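# Worked example of _normalize_price: with seller_target=100 and buyer_target=70,
# a mentioned price of 85 maps to (100 - 85) / (100 - 70) = 0.5. The scale puts
# the seller's target at 0 and the buyer's target at 1, so mentions outside
# [0, 2] are discarded above as probably not being prices.
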
def start_inference(question):
    global inference_helper, inference_object

    # Start inference; set the global tuple with model, flags and hparams
    inference_object = do_start_inference(out_dir, hparams)

    # The first inference() call invokes this method; now that everything is
    # running, replace inference() with the actual function call
    inference_helper = lambda question: do_inference(tokenize(question), *inference_object)

    # Rerun the inference() call
    return inference_helper(question)

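# Design note: start_inference lazily builds the model on the first call and
# then rebinds inference_helper so later calls skip setup. A generic sketch of
# the same self-replacing-function pattern (all names here are hypothetical):
def _expensive_setup():
    return {'model': 'loaded'}   # stands in for do_start_inference()

def helper(x):
    global helper
    state = _expensive_setup()            # runs exactly once
    helper = lambda x: ('result', state)  # later calls bypass setup entirely
    return helper(x)
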
def __init__(self, agent, kb, uuid, model='seq2seq'): """ Dialogue data that is needed by the model. :param agent: Index of the agent that will be the buyer in the scenario :param kb: Knowledge base for the current agent in the current scenario :param uuid: Scenario ID :param model: Model Type. Can be seq2seq for the word model, or lf2lf for the coarse dialogue act model Note: lf2lf stands for logical form to logical form """ self.uuid = uuid self.agent = agent self.kb = kb self.model = model self.agent_to_role = self.get_role_mapping(agent, kb) # KB context # NOTE: context_to_int will change category, title, description to integers self.category_str = kb.category self.category = kb.category self.title = tokenize( re.sub(r'[^\w0-9]', ' ', kb.facts['item']['Title'])) self.description = tokenize( re.sub(r'[^\w0-9]', ' ', ' '.join(kb.facts['item']['Description']))) # token_turns: tokens and entitys (output of entity linking) self.token_turns = [] # parsed logical forms self.lfs = [] # turns: input tokens of encoder, decoder input and target, later converted to integers self.turns = [[], [], []] # entities: has the same structure as turns, non-entity tokens are None self.entities = [] self.agents = [] self.roles = [] self.is_int = False # Whether we've converted it to integers self.num_context = None
def get_avg_tokens_per_agent(transcript):
    tokens = {0: 0., 1: 0.}
    utterances = {0: 0., 1: 0.}
    for event in transcript["events"]:
        if event["action"] == "message":
            msg_tokens = tokenize(event["data"])
            tokens[event["agent"]] += len(msg_tokens)
            utterances[event["agent"]] += 1
    if utterances[0] != 0:
        tokens[0] /= utterances[0]
    if utterances[1] != 0:
        tokens[1] /= utterances[1]
    return tokens

def skip_example(cls, example): """ Skip all examples that do not have enough tokens or turns to be a good example :return: True if both agents speak less then 40 tokens of if the dialogue has less than two turns """ tokens = {0: 0, 1: 0} turns = {0: 0, 1: 0} for event in example.events: if event.action == "message": msg_tokens = tokenize(event.data) tokens[event.agent] += len(msg_tokens) turns[event.agent] += 1 if tokens[0] < 40 and tokens[1] < 40: return True if turns[0] < 2 or turns[1] < 2: return True return False
def get_speech_acts(self, ex):
    stats = {0: [], 1: []}
    kbs = ex.kbs
    for e in ex.events:
        if e.action != 'message':
            continue
        sentences = self.split_turn(e.data.lower())
        for s in sentences:
            tokens = tokenize(s)
            linked_tokens = self.price_tracker.link_entity(tokens, kb=kbs[e.agent])
            act = SpeechActAnalyzer.get_speech_act(s, linked_tokens)
            stats[e.agent].append(act)
    return stats

def parse_message(self, event, dialogue_state):
    tokens = self.lexicon.link_entity(tokenize(event.data), kb=self.kb, scale=False)
    template = self.extract_template(tokens, dialogue_state)
    utterance = Utterance(raw_text=event.data, tokens=tokens)
    tokens_with_parsed_price = self.parse_prices(tokens, dialogue_state)
    intent = self.classify_intent(utterance, tokens_with_parsed_price, dialogue_state)
    proposed_price = self.get_proposed_price(tokens_with_parsed_price, dialogue_state)
    utterance.lf = LF(intent, price=proposed_price)
    utterance.template = template
    return utterance

def parser_stats(self, parsed_dialogues, agent=None):
    stats = {}
    non_entity_vocab = set()
    ents = set()
    stats['intents'] = defaultdict(int)
    intent_utterances = defaultdict(list)

    for dialogue in parsed_dialogues:
        for utterance in dialogue:
            if agent and utterance.agent != agent:
                continue
            if utterance.tokens is not None:
                tokens = [x.canonical.type if is_entity(x) else x
                          for x in utterance.tokens]
                e = [x.surface for x in utterance.tokens if is_entity(x)]
                ents.update(e)
                non_entity_vocab.update(tokens)
            if utterance.lf and utterance.lf.intent != '<start>':
                stats['intents'][utterance.lf.intent] += 1
                if utterance.text is not None:
                    intent_utterances[utterance.lf.intent].append(tokenize(utterance.text))

    stats['non_entity_vocab_size'] = len(non_entity_vocab)
    stats['intent_corpus_perplexity'] = self.intent_sequence_perplexity(intent_utterances)
    self.print_stats(stats, 'parser stats')
    return stats

def process_event(self, e, kb):
    '''
    Tokenize, link entities.
    '''
    from cocoa.core.event import Event

    # Process a semi-event: a list whose second element, if present,
    # is a raw price to be scaled
    if not isinstance(e, Event):
        if len(e) < 2 or e[1] is None:
            return e
        else:
            e[1] = PriceScaler._scale_price(kb, e[1])
            return e

    if e.action == 'message':
        # Lowercase, tokenize, link entities
        entity_tokens = self.lexicon.link_entity(tokenize(e.data), kb=kb,
                                                 scale=True, price_clip=4.)
        return entity_tokens if entity_tokens else None
    elif e.action == 'offer':
        data = e.data['price']
        if data is None:
            return None
        price = PriceScaler._scale_price(kb, data)
        return [markers.OFFER, self.price_to_entity(price)]
    elif e.action == 'quit':
        return [markers.QUIT]
    elif e.action == 'accept':
        return [markers.ACCEPT]
    elif e.action == 'reject':
        return [markers.REJECT]
    else:
        raise ValueError('Unknown event action.')

def _get_price_mentions(self, chat, agent=None):
    scenario = NegotiationScenario.from_dict(None, chat['scenario'])
    kbs = scenario.kbs
    prices = 0
    for e in chat['events']:
        if agent is not None and e['agent'] != agent:
            continue
        if e['action'] == 'message':
            raw_tokens = tokenize(e['data'])
            # Link entities
            linked_tokens = self.price_tracker.link_entity(raw_tokens, kb=kbs[e['agent']])
            for token in linked_tokens:
                if isinstance(token, Entity) and token.canonical.type == 'price':
                    prices += 1
    return prices

def parse_message(self, event, dialogue_state):
    tokens = self.lexicon.link_entity(
        tokenize(event.data), kb=self.kb,
        mentioned_entities=dialogue_state.mentioned_entities, known_kb=False)
    utterance = Utterance(raw_text=event.data, tokens=tokens)
    intent = self.classify_intent(utterance)

    # Split entities into mentioned vs negated (preceded by a negation word)
    exclude_entities = []
    entities = []
    for i, token in enumerate(tokens):
        if is_entity(token):
            if i > 0 and tokens[i - 1] in self.neg_words:
                exclude_entities.append(token.canonical)
            else:
                entities.append(token.canonical)

    if len(entities) == 0 and len(exclude_entities) > 0:
        intent = 'negative'

    signature = ''
    if self.is_negative(utterance) and intent == 'inform':
        utterance.ambiguous_template = True
    elif entities:
        signature = self.signature(entities)
    elif exclude_entities:
        signature = self.signature(exclude_entities)

    if intent == 'negative' and not exclude_entities:
        exclude_entities = dialogue_state.my_entities

    lf = LF(intent, entities=entities, exclude_entities=exclude_entities,
            signature=signature)
    utterance.lf = lf
    utterance.template = self.extract_template(tokens, dialogue_state)
    return utterance

def example_stats(self, examples, agent=None):
    stats = {}
    stats['num_dialogues'] = len(examples)
    stats['num_turns_per_dialogue'] = np.mean([len(e.events) for e in examples])
    utterances = [tokenize(e.data)
                  for example in examples
                  for e in example.events
                  if e.action == 'message' and
                  (not agent or example.agents[e.agent] == agent)]
    stats['num_tokens_per_turn'] = np.mean([len(u) for u in utterances])
    vocab = set()
    for u in utterances:
        vocab.update(u)
    stats['vocab_size'] = len(vocab)
    global all_vocab
    all_vocab = vocab
    stats['corpus_perplexity'] = self.sequence_perplexity(utterances)
    self.print_stats(stats, 'dataset stats')
    return stats

def inference_internal(self, question):
    answers = self.do_inference(tokenize(question))
    answers = detokenize(answers)
    answers = replace_in_answers(answers, 'answers')
    answers_rate = score_answers(answers, 'answers')
    return (answers, answers_rate)

    ['no.AA', 'no. AA'],
    ['Mr. Daniel', 'Mr. Daniel'],
    ['mr. Daniel', 'mr. Daniel'],
    ['Mr.Daniel', 'Mr. Daniel'],
    ['mr.Daniel', 'mr. Daniel'],
    ['mrr. Daniel', 'mrr . Daniel'],
    ['test No.25 test No. 25 test mr. Daniel test',
     'test No. 2 5 test No. 2 5 test mr. Daniel test'],
    ['https://www.youtube.com/watch?v=r8b0PWR1qxI',
     'https : / / www.youtube.com / watch ? v = r 8 b 0 PWR 1 qxI'],
    ['www.example.com', 'www.example.com'],
    [':)', ': )'],
    ['word...', 'word ...'],
    ['360,678', '3 6 0 , 6 7 8'],
    ['360.678', '3 6 0 . 6 7 8'],
    ['Test phrase. Test phrase.', 'Test phrase . Test phrase .'],
    ['<unk>', ''],
    ["you're", "you ' re"],
    ["you 're", "you ' re"],
    ["you' re", "you ' re"],
    ["1950's", "1 9 5 0 ' s"],
    ['`', "'"],
    ["''", '"'],
    [':/', ': /'],
    ['^^^^^', '^ ^ ^ ^ ^'],
]

init()

for test in tests:
    tokenized = tokenize(test[0])
    print('[{}] {} -> {}{}'.format(
        Fore.GREEN + 'PASS' + Fore.RESET if tokenized == test[1] else Fore.RED + 'FAIL' + Fore.RESET,
        test[0], test[1],
        '' if tokenized == test[1] else ' Result: {}'.format(tokenized)))

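# Each entry above is an [input, expected] pair, where `expected` is the
# space-joined token string the tokenizer should produce. The cases document
# the tokenizer's rules: digits are split into single characters, URLs are
# broken on punctuation (while bare domains stay intact), title abbreviations
# like 'Mr.' are normalized, and '<unk>' is dropped. Adding a rule means
# adding another pair here.
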
    return command


print("\nHello, Terra Home here, your home assistant. How may I help you?")
engine.say("Hello, Terra Home here, your home assistant. How may I help you?")
engine.runAndWait()

colorama.init()
listen = True
intents = ['song', 'email', 'weather', 'foursquare_explore', 'joke', 'bye']

# QAs
while listen:
    question = myCommand()
    ques_tokens = tokenize(question)
    resp = requests.get(url + question,
                        headers={'Authorization': f'Bearer {ACCESS_TOKEN}'})
    intent_resp = json.loads(resp.text)
    intent_resp_str = resp.text
    if 'intents' in intent_resp['entities']:
        intent = intent_resp['intents'][0]['name']
    else:
        intent = 'chat'
    if intent not in intents:
        intent = 'chat'

def from_text(cls, raw_text, price_tracker, kb):
    tokens = price_tracker.link_entity(tokenize(raw_text), kb=kb, scale=False)
    return cls(raw_text, tokens)

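# Usage sketch (objects are hypothetical): build an Utterance directly from raw
# text, letting the price tracker link price mentions against the agent's KB:
# utterance = Utterance.from_text('would you take $50?', price_tracker, kb)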