Пример #1
0
 def process_utterance(self, utterance, stage=None):
     '''
     Convert the entities of an utterance to the form required by a stage.

     Input: utterance is a list of tokens, stage is either None or one of
          'encoding', 'decoding', 'target'.
     Output: a new list of tokens.  With no stage, every entity is replaced
          by its 'canonical' form and nothing is summarized.  Otherwise the
          utterance is shaped according to self.model first:
            - "sum2sum": the summary is used for every stage;
            - "sum2seq": the summary is used for encoding; for decoding and
              target it is followed by markers.END_SUM and the full sequence;
            - anything else: the full sequence is kept.
          Entities are then converted with self.entity_forms[stage].
     Raises: ValueError if stage is given but is not a known stage.
     '''
     if stage is None:
         return [
             self.get_entity_form(x, 'canonical') if is_entity(x) else x
             for x in utterance
         ]

     if stage not in ('encoding', 'decoding', 'target'):
         # An unknown stage used to fall through every branch and fail later
         # with an UnboundLocalError on `summary`; fail early and clearly.
         raise ValueError('Unknown stage %r.' % (stage,))

     if self.model == "sum2sum":
         summary = self.summarize(utterance)
     elif self.model == "sum2seq":
         summary = self.summarize(utterance)
         if stage != 'encoding':
             # Decoder side sees: summary, end-of-summary marker, full text.
             summary.append(markers.END_SUM)
             summary.extend(utterance)
     else:
         summary = utterance

     return [
         self.get_entity_form(x, self.entity_forms[stage])
         if is_entity(x) else x for x in summary
     ]
Пример #2
0
 def __init__(self,
              model,
              vocab,
              temperature=1,
              max_length=100,
              cuda=False):
     '''
     Initialise the sampler and precompute groups of vocabulary indices.

     All arguments are forwarded unchanged to the parent constructor.
     Afterwards the following attributes are derived from self.vocab
     (presumably set by the parent constructor -- confirm):
       price_actions: indices of 'init-price', 'counter-price' and
           markers.OFFER.
       prices / price_list: indices of every vocabulary word that is an
           entity, as a set and as a list.
       eos: index of markers.EOS.
       actions: indices of every word that is neither an entity, a
           category/sequence marker, nor one of the hard-coded specials.
     '''
     super(LFSampler, self).__init__(model,
                                     vocab,
                                     temperature=temperature,
                                     max_length=max_length,
                                     cuda=cuda)
     # Stored as real lists: on Python 3 a bare map() is a one-shot iterator,
     # so a second membership test against it would silently see nothing.
     self.price_actions = [
         self.vocab.to_ind(w)
         for w in ('init-price', 'counter-price', markers.OFFER)
     ]
     # items() works on both Python 2 and 3 (iteritems() is Python 2 only).
     self.prices = set(
         id_ for w, id_ in self.vocab.word_to_ind.items() if is_entity(w))
     self.price_list = list(self.prices)
     self.eos = self.vocab.to_ind(markers.EOS)
     # TODO: fix the hard coding
     non_action_words = (vocab.UNK, '</sum>', '<slot>', '</slot>')
     actions = set(
         w for w in self.vocab.word_to_ind
         if not (is_entity(w) or w in category_markers
                 or w in sequence_markers or w in non_action_words))
     self.actions = [self.vocab.to_ind(w) for w in actions]
Пример #3
0
 def send(self):
     '''
     Produce this agent's next event, or None when it stays silent.

     Returns the result of self.select(...) when an item has been matched
     or the decoded tokens are a selection, the result of self.message(...)
     for ordinary text, and None when sending is suppressed or decoding
     yields nothing.
     '''
     # Don't send consecutive utterances with entities
     if self.sent_entity and not self.env.consecutive_entity:
         return None
     if self.matched_item is not None:
         return self.select(self.matched_item)

     # A single decoding attempt.
     tokens = self.decode()
     if tokens is None:
         return None

     self.sent_entity = True if self._has_entity(tokens) else False

     # Remember every entity we have talked about.
     for token in tokens:
         if not is_entity(token):
             continue
         self.mentioned_entities.add(token[1][0])

     # Turn entities back into surface text.
     if self.env.realizer is not None:
         tokens = self.env.realizer.realize_entity(tokens)
     else:
         tokens = [x[0] if is_entity(x) else x for x in tokens]

     is_selection = (len(tokens) > 1 and tokens[0] == markers.SELECT
                     and tokens[1].startswith('item-'))
     if is_selection:
         item_id = int(tokens[1].split('-')[1])
         self.selected_items.add(item_id)
         return self.select(self.kb.items[item_id])

     text = ' '.join(self.naturalize(tokens))
     return self.message(self.attach_punct(text))
Пример #4
0
 def process_utterance(self, utterance, stage=None):
     '''
     Replace every entity in an utterance by the form used for a stage.

     Input: utterance is a list of tokens; stage is None or a key of
          self.entity_forms.
     Output: a new list in which non-entity tokens are unchanged and each
          entity is passed through self.get_entity_form, using 'canonical'
          when no stage is given and self.entity_forms[stage] otherwise.
     '''
     def convert(token):
         if not is_entity(token):
             return token
         form = 'canonical' if stage is None else self.entity_forms[stage]
         return self.get_entity_form(token, form)

     return [convert(token) for token in utterance]
Пример #5
0
 def process_utterance(self, utterance, stage=None):
     '''
     Map the entities of an utterance to their stage-specific form.

     Input: utterance is a list of tokens, stage is either encoding,
          decoding or target (or None).
     Output: a new list; plain tokens are copied as-is, entities go through
          self.get_entity_form with the 'canonical' form when stage is None
          and with self.entity_forms[stage] otherwise.
     '''
     converted = []
     for token in utterance:
         if is_entity(token):
             # The form is looked up lazily, only when an entity is seen.
             form = 'canonical' if stage is None else self.entity_forms[stage]
             token = self.get_entity_form(token, form)
         converted.append(token)
     return converted
Пример #6
0
 def _treebank_to_liwc_token(self, tokens):
     '''
     Glue Treebank-style clitics back onto the word they follow.

     In the LIWC dictionary, "'re", "n't" etc. are not separated, so a
     non-entity token that starts with an apostrophe or equals "n't" is
     appended to the previous output token, provided there is one and it is
     not an entity.  Returns the new token list.
     '''
     merged = []
     for tok in tokens:
         is_clitic = not is_entity(tok) and (tok.startswith("'")
                                             or tok == "n't")
         if is_clitic and len(merged) > 0 and not is_entity(merged[-1]):
             merged[-1] += tok
             continue
         merged.append(tok)
     return merged
Пример #7
0
 def combine_repeated_entity(self, entity_tokens):
     '''
     Merge two nearby mentions of the same entity into one entity token.

     entity_tokens is a list whose items are either strings (plain words)
     or entities; anything that is not a string is treated as an entity and
     is indexed as (surface, canonical).

     When an entity follows a previous entity with a different surface but
     an equal canonical part, with at most max_dist plain words in between,
     the previous entity is replaced by a single entity whose surface is
     "<prev surface> <words in between> <surface>".  Otherwise tokens are
     copied through in order.  Returns the new list.
     '''
     try:
         string_types = basestring  # Python 2: str and unicode
     except NameError:
         string_types = str  # Python 3

     max_dist = 1
     prev_entity = None
     cache = []  # plain words seen since the previous entity
     combined = []
     for token in entity_tokens:
         if isinstance(token, string_types):
             if prev_entity is None:
                 combined.append(token)
             else:
                 # Hold back: these words may end up inside a merged surface.
                 cache.append(token)
             continue

         mergeable = (prev_entity is not None
                      and token[0] != prev_entity[0]
                      and token[1] == prev_entity[1]
                      and len(cache) <= max_dist)
         if mergeable:
             if cache:
                 surface = '%s %s %s' % (prev_entity[0], ' '.join(cache),
                                         token[0])
             else:
                 # No words in between: avoid the stray double space that
                 # the three-slot format would produce.
                 surface = '%s %s' % (prev_entity[0], token[0])
             combined[-1] = (surface, prev_entity[1])
         else:
             combined.extend(cache)
             combined.append(token)
         # NOTE(review): prev_entity is the raw token, not the merged one, so
         # a third matching mention overwrites the merged surface -- confirm
         # whether chained merges are expected.
         prev_entity = token
         cache = []
     combined.extend(cache)
     return combined
Пример #8
0
    def receive(self, event):
        '''
        Consume an incoming event and feed its tokens to the encoder.

        - 'select': try to match the selected item; on a match it is stored
          in self.matched_item and nothing is encoded.  Otherwise the item
          is encoded as markers.SELECT followed by its entities.
        - 'message': the text is preprocessed; an empty message (None) is
          ignored, otherwise the first (encoding) token sequence is used.

        Entities seen are added to self.mentioned_entities and markers.EOS
        is appended before calling self.encode.

        Raises ValueError for any other event action.
        '''
        # Reset status
        self.sent_entity = False
        # Parse utterance
        if event.action == 'select':
            self.matched_item = self._match(event.data)
            if self.matched_item is not None:
                # Got a match; we're done.
                return
            entity_tokens = [markers.SELECT] + self.env.preprocessor.item_to_entities(event.data, self.kb.attributes)
        elif event.action == 'message':
            entity_tokens = self.env.preprocessor.process_event(event, self.kb, mentioned_entities=self.mentioned_entities, known_kb=False)
            # Empty message.  This check must come before any indexing: the
            # preprocessor returns None when there is nothing to encode.
            if entity_tokens is None:
                return
            # Take the encoding version of sequence
            entity_tokens = entity_tokens[0]
        else:
            raise ValueError('Unknown event action %s.' % event.action)
        for token in entity_tokens:
            if is_entity(token):
                self.mentioned_entities.add(token[1][0])
        entity_tokens += [markers.EOS]

        self.encode(entity_tokens)
Пример #9
0
 def is_valid_action(self, action_tokens):
     '''
     Tell whether a decoded action is well formed.

     An empty action is invalid.  An action whose first token is one of
     self.price_actions is valid only when it is followed by an entity as
     its second token.  Every other non-empty action is valid.
     '''
     if not action_tokens:
         return False
     if action_tokens[0] not in self.price_actions:
         return True
     has_price = len(action_tokens) > 1 and is_entity(action_tokens[1])
     return bool(has_price)
Пример #10
0
    def _add_utterance(self, agent, utterance, lf=None):
        '''
        Record an utterance (and optionally its logical form) for an agent.

        If the agent is the one who spoke last, the tokens are appended to
        the current turn; otherwise a new turn is opened and the agent and
        its role are recorded as well.  For every token the parallel
        `entities` list holds the token itself when it is an entity and
        None otherwise.
        '''
        # Same agent talking => keep filling the current turn.
        same_speaker = len(self.agents) > 0 and agent == self.agents[-1]
        new_turn = not same_speaker

        utterance = self._insert_markers(agent, utterance, new_turn)
        entities = [tok if is_entity(tok) else None for tok in utterance]
        if not lf:
            lf = []
        else:
            lf_tokens = self.lf_to_tokens(self.kb, lf)
            lf = self._insert_markers(agent, lf_tokens, new_turn)

        if not new_turn:
            self.token_turns[-1].extend(utterance)
            self.entities[-1].extend(entities)
            self.lfs[-1].extend(lf)
            return

        self.agents.append(agent)
        self.roles.append(self.agent_to_role[agent])

        self.token_turns.append(utterance)
        self.entities.append(entities)
        self.lfs.append(lf)
Пример #11
0
    def send(self):
        '''
        Generate the agent's next event.

        Returns None when nothing is generated.  Otherwise the tokens are
        added to the dialogue and turned into an offer (OFFER followed by a
        price entity), an accept / reject / quit event (when the first
        token is the corresponding marker), or a plain message.
        '''
        tokens = self.generate()
        if tokens is None:
            return None
        self.dialogue.add_utterance(self.agent, list(tokens))

        is_offer = (len(tokens) > 1 and tokens[0] == markers.OFFER
                    and is_entity(tokens[1]))
        if is_offer:
            try:
                price = self.builder.get_price_number(tokens[1], self.kb)
                return self.offer({'price': price})
            except ValueError:
                # Fall through and send the tokens as an ordinary message.
                pass

        tokens = self.builder.entity_to_str(tokens, self.kb)

        if len(tokens) > 0:
            head = tokens[0]
            if head == markers.ACCEPT:
                return self.accept()
            if head == markers.REJECT:
                return self.reject()
            if head == markers.QUIT:
                return self.quit()

        text = self.attach_punct(' '.join(tokens))
        return self.message(text)
Пример #12
0
 def calculate_lengths(self, preds):
     '''
     Break a predicted sequence down into token counts.

     Returns the tuple (total_len, keyword_len, marker_len, entity_len),
     where keyword_len is whatever is left after removing markers and
     entities from the total.
     '''
     total_len = len(preds)
     # TODO: counting markers with `x in markers` doesn't work with the
     # Marker class.  Until that is solved, report zero markers (so markers
     # are counted as keywords) instead of failing with a NameError on the
     # never-assigned marker_len.
     marker_len = 0
     entity_len = sum(1 for x in preds if is_entity(x))
     keyword_len = total_len - marker_len - entity_len
     return (total_len, keyword_len, marker_len, entity_len)
Пример #13
0
 def process_event(self, e, kb, mentioned_entities=None, known_kb=True):
     '''
     Convert an event into a pair of token lists (encoding, decoding).

     - 'message': the text is tokenized, entity-linked and has its plain
       tokens number-normalized.  Returns (tokens, shallow copy of tokens),
       or None when no tokens remain.
     - 'select': returns the item as entities for encoding and as an
       item-id entity (relative to the speaker) for decoding, each prefixed
       with markers.SELECT.

     Raises ValueError for any other action.
     '''
     action = e.action
     if action == 'select':
         # Convert an item to item-id (wrt to the speaker)
         item_id = self.get_item_id(kb, e.data)
         # We use the entities to represent the item during encoding and item-id during decoding
         encoding = [markers.SELECT] + self.item_to_entities(
             e.data, kb.attributes)
         decoding = [markers.SELECT, item_to_entity(item_id)]
         return (encoding, decoding)
     if action != 'message':
         raise ValueError('Unknown event action.')

     # Lower, tokenize, link entity
     linked = self.lexicon.link_entity(tokenize(e.data),
                                       kb=kb,
                                       mentioned_entities=mentioned_entities,
                                       known_kb=known_kb)
     entity_tokens = [
         x if is_entity(x) else normalize_number(x) for x in linked
     ]
     if not entity_tokens:
         return None
     # NOTE: have two copies because we might change it given decoding/encoding
     return (entity_tokens, copy.copy(entity_tokens))
Пример #14
0
def build_vocab(dialogues, special_symbols=None, entity_forms=None):
    '''
    Build a Vocabulary from the tokens of a collection of dialogues.

    Args:
        dialogues: iterable of dialogues that have not been converted to
            integers yet (dialogue.is_int must be False).
        special_symbols: words added after all dialogue tokens; defaults to
            no extra symbols.
        entity_forms: entity forms under which each entity is added; the
            'graph' form is skipped.  Defaults to no forms.

    Returns:
        The populated Vocabulary.
    '''
    # None replaces the former mutable `[]` defaults, which are shared
    # between calls.
    if special_symbols is None:
        special_symbols = []
    if entity_forms is None:
        entity_forms = []

    vocab = Vocabulary(offset=0, unk=True)

    def _add_entity(entity):
        for entity_form in entity_forms:
            # If copy entity embedding from the graph embedding, don't need entity in vocab
            if entity_form != 'graph':
                word = Preprocessor.get_entity_form(entity, entity_form)
                vocab.add_word(word)

    # Add words
    for dialogue in dialogues:
        assert dialogue.is_int is False
        for turns in dialogue.token_turns:
            for turn in turns:
                for token in chain.from_iterable(turn):
                    if is_entity(token):
                        _add_entity(token)
                    else:
                        vocab.add_word(token)

    # Add special symbols
    vocab.add_words(special_symbols)
    print('Vocabulary size:', vocab.size)
    return vocab
Пример #15
0
    def log(self, sent_number):
        """
        Format this translation as a human-readable report.

        The report contains the raw input, the best prediction, the gold
        sentence when available, and every hypothesis with its score when
        there is more than one.  The text is returned, not printed;
        sent_number is accepted but not used.
        """
        raw_tokens = [
            str(tok) if is_entity(tok) else tok for tok in self.src_raw
        ]
        output = u'RAW INPUT: {}\n'.format(' '.join(raw_tokens))

        best_pred = self.pred_sents[0]
        # Only needed by the disabled score line below.
        best_score = self.pred_scores[0]
        output += 'PRED OUTPUT: {}\n'.format(
            ' '.join(str(tok) for tok in best_pred))
        # output += "PRED SCORE: {:.4f}\n".format(best_score)

        if self.gold_sent is not None:
            gold_text = ' '.join(str(tok) for tok in self.gold_sent)
            output += u'GOLD: {}\n'.format(gold_text)
            # gold score is always 0 because that is the highest possible
            # output += "GOLD SCORE: {:.4f}\n".format(self.gold_score)

        if len(self.pred_sents) > 1:
            output += 'BEST HYP:\n'
            hypotheses = zip(self.pred_scores, self.pred_sents)
            output += ''.join("[{:.4f}] {}\n".format(score, sent)
                              for score, sent in hypotheses)

        return output + "\n"
Пример #16
0
def build_utterance_vocab(dialogues,
                          special_symbols=None,
                          entity_forms=None,
                          except_words=None):
    '''
    Build the utterance Vocabulary from a collection of dialogues.

    Args:
        dialogues: iterable of dialogues that have not been converted to
            integers yet (dialogue.is_int must be False).
        special_symbols: words added as special symbols after all dialogue
            tokens; defaults to none.
        entity_forms: entity forms under which each entity is added;
            defaults to none.
        except_words: forwarded to the Vocabulary constructor; defaults to
            an empty list.

    Returns:
        The finished Vocabulary (finish is called with size_threshold=10000).
    '''
    # None replaces the former mutable `[]` defaults, which are shared
    # between calls.
    if special_symbols is None:
        special_symbols = []
    if entity_forms is None:
        entity_forms = []
    if except_words is None:
        except_words = []

    vocab = Vocabulary(offset=0, unk=True, except_words=except_words)

    def _add_entity(entity):
        for entity_form in entity_forms:
            word = get_entity_form(entity, entity_form)
            vocab.add_word(word)

    # Add words
    for dialogue in dialogues:
        assert dialogue.is_int is False
        for turn in dialogue.token_turns:
            for token in turn:
                if is_entity(token):
                    _add_entity(token)
                else:
                    vocab.add_word(token)

    # Add special symbols
    vocab.add_words(special_symbols, special=True)
    vocab.finish(size_threshold=10000)
    print('Utterance vocab size:', vocab.size)
    return vocab
Пример #17
0
    def get_policyHistogram(self):
        '''
        Draw a weighted histogram of the averaged policy over entity values.

        self.policy_history is averaged over its first axis and the first
        row of that mean is used.  Each position whose vocabulary word is an
        entity with |canonical value| <= 10.1 contributes its value as a
        sample and its averaged entry as the (normalized) weight.  The
        histogram is drawn with seaborn; nothing is returned or shown here.
        '''
        import numpy as np
        import seaborn as sns

        mean_policy = np.mean(self.policy_history, axis=0)[0]
        values, weights = [], []
        for idx, weight in enumerate(mean_policy):
            word = self.vocab.ind_to_word[idx]
            if not is_entity(word):
                continue
            value = word.canonical.value
            if abs(value) > 10.1:
                continue
            values.append(value)
            weights.append(weight)

        # Normalize so that the weights sum to one.
        weights = np.asarray(weights) / np.sum(weights)
        sns.distplot(
            values,
            bins=100,
            kde=False,
            hist_kws={'weights': weights},
        )
Пример #18
0
    def _tokens_to_event(self, tokens, output_data, semi_event=False):
        '''
        Turn decoded tokens into an event (or a "semi event" token list).

        Args:
            tokens: sequence whose first item is the intent (a word or a
                vocabulary index) and whose second item, when present, is
                the price (a float, an entity, or None).  A tuple is copied
                to a list; a list is modified in place.
            output_data: attached to the event metadata as "output_data".
            semi_event: when true, only the price slot is converted with
                self.builder.get_price_number and the token list itself is
                returned instead of an event.

        Returns:
            The token list for a semi event; otherwise the result of
            self.offer / self.accept / self.reject / self.quit /
            self.message.

        Raises:
            AssertionError: if the intent is OFFER but no price entity
                follows it.
        '''
        if isinstance(tokens, tuple):
            tokens = list(tokens)

        # The price slot is optional: guard every access to tokens[1].
        has_price_slot = len(tokens) > 1

        if has_price_slot and isinstance(tokens[1], float):
            tokens[1] = CanonicalEntity(type='price', value=tokens[1])

        if semi_event:
            # From scale to real price
            if has_price_slot and tokens[1] is not None:
                tokens[1] = self.builder.get_price_number(tokens[1], self.kb)
            return tokens

        if tokens and isinstance(tokens[0], int):
            tokens[0] = self.env.vocab.to_word(tokens[0])

        if len(tokens) > 1 and tokens[0] == markers.OFFER and is_entity(tokens[1]):
            try:
                price = self.builder.get_price_number(tokens[1], self.kb)
                return self.offer({'price': price}, metadata={"output_data": output_data})
            except ValueError:
                # The price could not be resolved: fall through and send the
                # tokens as an ordinary message instead.
                pass
        elif tokens and tokens[0] == markers.OFFER:
            # Explicit raise (rather than `assert False`) so the check is
            # not stripped when Python runs with -O.
            raise AssertionError('OFFER must be followed by a price entity.')

        tokens = self.builder.entity_to_str(tokens, self.kb)

        if len(tokens) > 0:
            if tokens[0] == markers.ACCEPT:
                return self.accept(metadata={"output_data": output_data})
            elif tokens[0] == markers.REJECT:
                return self.reject(metadata={"output_data": output_data})
            elif tokens[0] == markers.QUIT:
                return self.quit(metadata={"output_data": output_data})

        # Drop trailing empty slots (e.g. an intent without a price).
        while len(tokens) > 0 and tokens[-1] is None:
            tokens = tokens[:-1]
        s = self.attach_punct(' '.join(tokens))

        role = self.kb.facts['personal']['Role']
        category = self.kb.facts['item']['Category']
        real_uttr = self.uttr_gen(tokens, role, category)

        return self.message(s, metadata={"output_data": output_data, "real_uttr": real_uttr})
Пример #19
0
 def label_liwc(self, liwc):
     '''
     Count LIWC categories for every message utterance.

     For each utterance whose action is 'message', the tokens are first
     converted with _treebank_to_liwc_token; then every non-entity token is
     looked up in `liwc` and utterance.categories[cat][token] is incremented
     for each category returned.
     '''
     for utterance in self.iter_utterances():
         if utterance.action != 'message':
             continue
         tokens = self._treebank_to_liwc_token(utterance.tokens)
         for token in tokens:
             if is_entity(token):
                 continue
             for cat in liwc.lookup(token):
                 utterance.categories[cat][token] += 1
Пример #20
0
 def map_prices(self, entity_tokens):
     '''
     Render a token list as plain tokens with prices written out.

     The tokens are first passed through Dialogue.original_price with this
     object's kb (presumably mapping prices back to their original scale --
     confirm); each entity is then replaced by str() of its canonical value
     and every other token is kept as is.
     '''
     # NOTE: entities are CanonicalEntities, change to Entity
     restored = Dialogue.original_price(self.kb, entity_tokens)
     tokens = []
     for tok in restored:
         tokens.append(str(tok.canonical.value) if is_entity(tok) else tok)
     return tokens
Пример #21
0
 def _process_target_tokens(self, tokens):
     '''
     Reduce target tokens to what predictions are evaluated against.

     Entities are replaced by their second element (the canonical part);
     other tokens are kept unchanged.

     TODO: for now evaluate against canonical entities. In future, evaluate
     against actual utterances.
     '''
     targets = []
     for token in tokens:
         targets.append(token[1] if is_entity(token) else token)
     #targets = [x for x in targets if x not in (markers.PAD,)]
     return targets
Пример #22
0
 def parse_message(self, event, dialogue_state):
     '''
     Parse a message event into an Utterance with a logical form.

     The text is entity-linked and its non-entity tokens are lower-cased.
     The intent comes from self.classify_intent, the template from
     self.extract_template, and the logical form carries the entities of
     type 'title'.  Returns the Utterance.
     '''
     linked = self.lexicon.link_entity(event.data)
     tokens = [tok if is_entity(tok) else tok.lower() for tok in linked]
     utterance = Utterance(raw_text=event.data, tokens=tokens)

     intent = self.classify_intent(utterance, dialogue_state)
     template = self.extract_template(tokens, dialogue_state)
     titles = self.get_entities(tokens, 'title')

     utterance.lf = LF(intent, titles=titles)
     utterance.template = template
     return utterance
Пример #23
0
 def parse_message(self, event, dialogue_state):
     '''
     Parse a message event into an Utterance with a placeholder logical form.

     The text is entity-linked and its non-entity tokens are lower-cased.
     The template comes from self.extract_template; the intent and topic of
     the logical form are fixed placeholder strings.  Returns the Utterance.
     '''
     tokens = []
     for tok in self.lexicon.link_entity(event.data):
         tokens.append(tok if is_entity(tok) else tok.lower())

     utterance = Utterance(raw_text=event.data, tokens=tokens)
     template = self.extract_template(tokens, dialogue_state)
     utterance.lf = LF("placeholder_intent", topic="placeholder")
     utterance.template = template
     return utterance
Пример #24
0
 def eval(self, kb, utterance):
     '''
     Scan an utterance for entity mentions and score each of them.

     utterance: a list of tokens and entities represented as a tuple (surface_form, (canonical_form, type))

     Every entity whose type is not 'item' increments the fact counter and
     is then dispatched on its neighbours:
       - "<tok> ent1 and ent2" is passed to eval_joint,
       - "<tok> ent" is passed to eval_single,
       - either pattern followed by "in those" counts as a coreference,
       - anything that does not fit is counted as undecided.
     Nothing is returned; results are recorded through the inc_* / eval_*
     calls.
     '''
     #print 'eval:', utterance
     N = len(utterance)
     i = 0
     while i < N:
         token = utterance[i]
         if is_entity(token) and token[1][1] != 'item':
             self.inc_fact()
             if i + 1 < N and utterance[i + 1] == 'and':
                 # number ent1 and ent2
                 # Needs one token before the entity and two after 'and'.
                 if i - 1 < 0 or i + 3 > N:
                     self.inc_undecided()
                     i += 1
                 else:
                     # Window: [token before ent1, ent1, 'and', ent2].
                     start, end = i - 1, i + 3
                     if not is_entity(utterance[i + 2]):
                         self.inc_undecided()
                     else:
                         if end + 1 < N and utterance[end:end +
                                                      2] == ['in', 'those']:
                             self.inc_coref()
                             i = end + 2
                         else:
                             self.eval_joint(kb, utterance[start:end])
                     # NOTE(review): this assignment also runs after the
                     # coreference case above and overrides `i = end + 2`,
                     # so 'in those' is scanned again -- confirm intent.
                     i = end
             elif i - 1 > 0:
                 # number ent
                 # NOTE(review): the branch above accepts i - 1 == 0, this
                 # one requires i - 1 > 0, so an entity at index 1 ends up
                 # undecided -- possibly an off-by-one, confirm.
                 start, end = i - 1, i + 1
                 if end + 1 < N and utterance[end:end +
                                              2] == ['in', 'those']:
                     self.inc_coref()
                     i = end + 2
                 else:
                     self.eval_single(kb, utterance[start:end])
                     i = end
             else:
                 self.inc_undecided()
                 i += 1
         else:
             i += 1
Пример #25
0
    def parser_stats(self, parsed_dialogues, agent=None):
        '''
        Compute, print and return statistics over parsed dialogues.

        Args:
            parsed_dialogues: iterable of dialogues, each an iterable of
                utterances with `agent`, `tokens`, `lf` and `text`.
            agent: when not None, only utterances of this agent are counted.

        Returns:
            A dict with
              'intents': count per intent (the '<start>' intent excluded),
              'non_entity_vocab_size': number of distinct tokens after
                  replacing every entity by its canonical type,
              'intent_corpus_perplexity': result of
                  self.intent_sequence_perplexity on the tokenized texts
                  grouped by intent.
        '''
        stats = {}
        non_entity_vocab = set()
        stats['intents'] = defaultdict(int)
        intent_utterances = defaultdict(list)

        for dialogue in parsed_dialogues:
            for utterance in dialogue:
                # Compare against None: a plain truth test would silently
                # disable the filter for agent id 0.
                if agent is not None and utterance.agent != agent:
                    continue
                if utterance.tokens is not None:
                    non_entity_vocab.update(
                        x.canonical.type if is_entity(x) else x
                        for x in utterance.tokens)
                lf = utterance.lf
                if lf and lf.intent != '<start>':
                    stats['intents'][lf.intent] += 1
                # An utterance without a logical form has no intent to group
                # its text under, so it is skipped instead of crashing.
                if utterance.text is not None and lf is not None:
                    intent_utterances[lf.intent].append(
                        tokenize(utterance.text))
        stats['non_entity_vocab_size'] = len(non_entity_vocab)

        stats['intent_corpus_perplexity'] = self.intent_sequence_perplexity(
            intent_utterances)

        self.print_stats(stats, 'parser stats')
        return stats
Пример #26
0
    def var_to_sent(self, variables, vocab=None):
        '''
        Turn a sequence of vocabulary indices into a readable string.

        `variables` must expose .data.cpu().numpy() (presumably a torch
        tensor/Variable -- confirm).  Padding indices are dropped, every
        remaining index is mapped to its word, entities are rendered with
        str(), and the words are joined with single spaces.  `vocab`
        defaults to self.vocab.
        '''
        vocab = vocab or self.vocab

        sent_ids = variables.data.cpu().numpy()
        pad_id = vocab.to_ind(markers.PAD)

        sent_words = []
        for idx in sent_ids:
            if idx != pad_id:
                sent_words.append(vocab.to_word(idx))

        pieces = []
        for word in sent_words:
            pieces.append(str(word) if is_entity(word) else word)
        return ' '.join(pieces)
Пример #27
0
 def process_turn(cls, turn):
     '''
     Flatten a turn into a single string with entities masked.

     A turn consisting only of markers.EOS becomes '_start_'; otherwise
     every entity is replaced by '_price_' and the tokens are joined with
     single spaces.
     '''
     if len(turn) == 1 and turn[0] == markers.EOS:
         # NOTE: don't use <> because this is ignored by the analyzer
         return '_start_'
     return ' '.join('_price_' if is_entity(x) else x for x in turn)
Пример #28
0
 def text_to_int(self, utterance, stage=None):
     '''
     Convert an utterance to a list of integer ids.

     The utterance is first passed through the preprocessor (with the stage
     when one is given).  When a stage is given and self.setting[stage] is
     false, every token is looked up in the vocabulary.  Otherwise
     non-entity tokens use the vocabulary and entities use the entity map,
     shifted by the vocabulary size.
     '''
     if stage is None:
         use_entity_map = True
         tokens = self.preprocessor.process_utterance(utterance)
     else:
         use_entity_map = self.setting[stage]
         tokens = self.preprocessor.process_utterance(utterance, stage)

     if not use_entity_map:
         return [self.vocab.to_ind(token) for token in tokens]

     # Entity ids live after the vocabulary ids.
     offset = self.vocab.size
     ids = []
     for token in tokens:
         if is_entity(token):
             ids.append(self.entity_map.to_ind(token) + offset)
         else:
             ids.append(self.vocab.to_ind(token))
     return ids
Пример #29
0
 def get_first_price(self, ex):
     '''
     Find the first price mentioned in a dialogue.

     Scans the message events of `ex` in order and stops at the first
     entity token.  Returns a dict keyed by agent id, initially
     {1: None, 0: None}, in which the entry of the agent who mentioned the
     price is set to (role, price).  Both entries stay None when no message
     contains an entity.
     '''
     agents = {1: None, 0: None}
     for e in ex.events:
         if e.action != 'message':
             continue
         for sent_tokens in e.tokens:
             for token in sent_tokens:
                 # Return at the first mention.  (The former "both agents
                 # filled" check could never fire: the dict is returned as
                 # soon as its first entry is set.)
                 if is_entity(token):
                     price = token.canonical.value
                     agents[e.agent] = (e.role, price)
                     return agents
     return agents
Пример #30
0
 def extract_template(self, tokens, dialogue_state):
     '''
     Replace numbers and entities in a token list by template slots.

     Tokens found in self.numbers, plus 'no' and 'all', become '{number}'.
     Each entity becomes '{<type>[<k>]}', where k counts the earlier
     entities of the same canonical type.  Other tokens are kept.
     dialogue_state is accepted but not used.
     '''
     seen_per_type = defaultdict(int)
     template = []
     for token in tokens:
         if token in self.numbers or token in ('no', 'all'):
             slot = '{number}'
         elif is_entity(token):
             type_ = token.canonical.type
             slot = '{{{0}[{1}]}}'.format(type_, seen_per_type[type_])
             seen_per_type[type_] += 1
         else:
             slot = token
         template.append(slot)
     return template