Example #1
    def get_entity_dict(self, turn_corpus):
        utt2act = {}
        for msg in turn_corpus:
            utt2act[" ".join(msg.utt[1:-1])] = msg

        dekenize = get_dekenize()
        utt2act = {dekenize(k.split()): v for k, v in utt2act.items()}
        self.logger.info("Compress utt2act from {}->{}".format(
            len(turn_corpus), len(utt2act)))

        # get entity value vocabulary
        domain_id2ent = defaultdict(set)
        for utt, msg in utt2act.items():
            for act in msg.actions:
                paras = act['parameters']
                intent = act['act']
                if intent == 'inform':
                    for v in paras[0].values():
                        domain_id2ent[msg.domain].add(str(v))
                elif intent == 'query':
                    for v in paras[0].values():
                        # cast to str for consistency with the 'inform' branch
                        domain_id2ent[msg.domain].add(str(v))
                else:
                    for _, v in paras:
                        if v:
                            domain_id2ent[msg.domain].add(v)
        domain_id2ent = {k: list(v) for k, v in domain_id2ent.items()}
        return domain_id2ent
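All four examples rely on get_dekenize(), which is defined elsewhere in the codebase. As a rough, hypothetical sketch of the contract these snippets assume, it returns a callable that turns a token list back into a surface string (the real implementation may differ):

import re

def get_dekenize():
    # Hypothetical stand-in: join tokens and undo the space that
    # tokenization inserted before punctuation.
    def dekenize(tokens):
        text = " ".join(tokens)
        return re.sub(r"\s+([.,!?;:])", r"\1", text)
    return dekenize

# dekenize(["hello", ",", "world", "!"]) -> "hello, world!"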
Example #2
def dump_latent(model, data_feed, config, log_dir):
    model.eval()
    de_tknize = utils.get_dekenize()
    data_feed.epoch_init(config, verbose=False, shuffle=False)
    logger.info("Dumping: {} batches".format(data_feed.num_batch))
    all_zs = []
    all_metas = []
    while True:
        batch = data_feed.next_batch()
        if batch is None:
            break
        # teacher-forcing pass that also returns the latent actions
        results = model(batch, mode=TEACH_FORCE, return_latent=True)

        labels = batch.outputs
        domains = batch.domains
        acts = batch.get('output_actions')

        latent_acts = results.latent_actions
        if isinstance(latent_acts, tuple):
            latent_acts = list(latent_acts[0].cpu().data.numpy())
        else:
            latent_acts = list(latent_acts.cpu().data.numpy())

        for b_id in range(labels.shape[0]):
            true_str, _ = get_sent(model, de_tknize, labels, b_id)
            act_str, _ = get_sent(model, de_tknize, acts, b_id)
            all_metas.append({'utt': true_str, 'domain': domains[b_id], 'acts': act_str})

        all_zs.extend(latent_acts)

    dump_path = os.path.join(log_dir, "latent-{}.p".format(utils.get_time()))
    with open(dump_path, 'wb') as f:
        pickle.dump({'z': all_zs, 'metas': all_metas}, f)
    logger.info("Dumping Done")
Example #3
    def get_intent_tagger(self, corpus):
        """
        :return: train a dialog act tagger for system utterances 
        """
        self.logger.info("Train a new intent tagger")
        all_tags, utts, tags = [], [], []
        de_tknize = get_dekenize()
        for msg in corpus:
            utts.append(de_tknize(msg.utt[1:-1]))
            tags.append([a['act'] for a in msg.actions])
            all_tags.extend([a['act'] for a in msg.actions])

        most_common = Counter(all_tags).most_common()
        self.logger.info(most_common)
        tag_set = [t for t, _ in most_common]
        rev_tag_set = {t: i for i, t in enumerate(tag_set)}

        # create train and test set:
        data_size = len(corpus)
        train_size = int(data_size * 0.7)
        train_utts = utts[0:train_size]
        test_utts = utts[train_size:]

        # create y:
        sparse_y = np.zeros([data_size, len(tag_set)])
        for idx, utt_tags in enumerate(tags):
            for tag in utt_tags:
                sparse_y[idx, rev_tag_set[tag]] = 1
        train_y = sparse_y[0:train_size, :]
        test_y = sparse_y[train_size:, :]

        # train classifier
        representation = CountVectorizer(ngram_range=(1, 2)).fit(train_utts)
        train_x = representation.transform(train_utts)
        test_x = representation.transform(test_utts)

        clf = OneVsRestClassifier(
            SGDClassifier(loss='hinge',
                          n_iter_no_change=10)).fit(train_x, train_y)
        pred_test_y = clf.predict(test_x)

        def print_report(score_name, scores, names):
            for s, n in zip(scores, names):
                self.logger.info("%s: %s -> %f" % (score_name, n, s))

        print_report('F1', metrics.f1_score(test_y, pred_test_y, average=None),
                     tag_set)

        x = representation.transform(utts)
        clf = OneVsRestClassifier(SGDClassifier(loss='hinge', n_iter_no_change=20)) \
            .fit(x, sparse_y)

        model_dump = {
            self.CLF: clf,
            self.REPRESENTATION: representation,
            self.ID2TAG: tag_set,
            self.TAG2ID: rev_tag_set
        }
        # pkl.dump(model_dump, open("{}.pkl".format(self.data_name), "wb"))
        return model_dump
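A hedged sketch of how the returned model_dump could be applied to a new utterance. The string keys below are stand-ins for the class constants (self.CLF, self.REPRESENTATION, self.ID2TAG) that are not shown in this snippet:

def tag_utterance(model_dump, utt, clf_key='clf', rep_key='rep', id2tag_key='id2tag'):
    # Featurize with the same CountVectorizer fitted at training time.
    x = model_dump[rep_key].transform([utt])
    # OneVsRestClassifier yields one binary indicator per tag.
    pred = model_dump[clf_key].predict(x)[0]
    return [model_dump[id2tag_key][i] for i, on in enumerate(pred) if on]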
Example #4
def generate(model, data_feed, config, evaluator, num_batch=1, dest_f=None):
    model.eval()
    de_tknize = get_dekenize()

    def write(msg):
        if msg is None or msg == '':
            return
        if dest_f is None:
            logger.info(msg)
        else:
            dest_f.write(msg + '\n')

    data_feed.epoch_init(config, shuffle=(num_batch is not None), verbose=False)
    evaluator.initialize()
    logger.info("Generation: {} batches".format(
        data_feed.num_batch if num_batch is None else num_batch))
    while True:
        batch = data_feed.next_batch()
        if batch is None or (num_batch is not None
                             and data_feed.ptr > num_batch):
            break
        outputs, labels = model(batch, mode=GEN, gen_type=config.gen_type)

        # move from GPU to CPU
        labels = labels.cpu()
        pred_labels = [
            t.cpu().data.numpy() for t in outputs[DecoderRNN.KEY_SEQUENCE]
        ]
        pred_labels = np.array(pred_labels,
                               dtype=int).squeeze(-1).swapaxes(0, 1)
        true_labels = labels.data.numpy()
        # get attention if possible
        if config.use_attn or config.use_ptr:
            pred_attns = [
                t.cpu().data.numpy()
                for t in outputs[DecoderRNN.KEY_ATTN_SCORE]
            ]
            pred_attns = np.array(pred_attns,
                                  dtype=float).squeeze(2).swapaxes(0, 1)
        else:
            pred_attns = None

        # contexts; only the most recent turn is printed below
        ctx = batch.get('contexts')
        ctx_len = batch.get('context_lens')
        domains = batch.domains
        attn_ctx = outputs.get(DecoderPointerGen.KEY_PTR_CTX)
        if attn_ctx is not None:
            attn_ctx = attn_ctx.cpu().data.numpy()
            attn_ctx = attn_ctx.reshape(attn_ctx.shape[0], -1)

        # log the batch as strings
        for b_id in range(pred_labels.shape[0]):
            pred_str, attn = get_sent(model,
                                      de_tknize,
                                      pred_labels,
                                      b_id,
                                      attn=pred_attns,
                                      attn_ctx=attn_ctx)
            true_str, _ = get_sent(model, de_tknize, true_labels, b_id)
            prev_ctx = ""
            if ctx is not None:
                ctx_str, _ = get_sent(model, de_tknize,
                                      ctx[:, ctx_len[b_id] - 1, :], b_id)
                prev_ctx = "Source: {}".format(ctx_str)

            domain = domains[b_id]
            evaluator.add_example(true_str, pred_str, domain)
            if num_batch is None or num_batch <= 2:
                write(prev_ctx)
                write("{}:: True: {} ||| Pred: {}".format(
                    domain, true_str, pred_str))
                if attn:
                    write("[[{}]]".format(attn))

    write(evaluator.get_report(include_error=dest_f is not None))
    logger.info("Generation Done")