Example #1
def dump_latent(model, data_feed, config, dest_f, num_batch=1):
    model.eval()
    de_tknize = utils.get_dekenize()
    data_feed.epoch_init(config, verbose=False, shuffle=False)
    logger.info("Dumping: {} batches".format(data_feed.num_batch
                                                if num_batch is None
                                                else num_batch))
    all_zs = []
    all_labels = []
    all_metas = []
    while True:
        batch = data_feed.next_batch()
        if batch is None or (num_batch is not None
                             and data_feed.ptr > num_batch):
            break
        results = model(batch, mode=TEACH_FORCE, return_latent=True)

        labels = batch.outputs
        metas = batch.metas
        log_qy = results.log_qy.cpu().squeeze(0).data
        y_ids = results.y_ids.cpu().data
        dec_init = results.dec_init_state.cpu().squeeze().data

        for b_id in range(labels.shape[0]):
            true_str, _ = engine.get_sent(model, de_tknize, labels, b_id)
            all_labels.append(true_str)
            all_metas.append(metas[b_id])

        all_zs.append((log_qy.numpy(), dec_init.numpy(), y_ids.numpy()))

    pickle.dump({'z': all_zs, 'labels': all_labels, "metas": all_metas}, dest_f)
    logger.info("Dumping Done")
Example #2
def sweep(model, data_feed, config, num_batch=1, dest_f=None):
    model.eval()
    old_batch_size = config.batch_size

    if num_batch is not None:
        config.batch_size = 10

    de_tknize = utils.get_dekenize()
    data_feed.epoch_init(config, shuffle=False, verbose=False)
    config.batch_size = old_batch_size

    logger.info("Generation: {} batches".format(
        data_feed.num_batch if num_batch is None else num_batch))

    def write(msg):
        if dest_f is None:
            logger.info(msg)
        else:
            dest_f.write(msg + '\n')

    while True:
        batch = data_feed.next_batch()
        if batch is None or (num_batch is not None
                             and data_feed.ptr > num_batch):
            break
        outputs, labels, all_y_ids = model.sweep(batch,
                                                 gen_type=config.gen_type)
        # move from GPU to CPU
        true_labels = labels.cpu().data.numpy()
        all_y_ids = all_y_ids.cpu().data.numpy()

        pred_labels = [
            t.cpu().data.numpy() for t in outputs[DecoderRNN.KEY_SEQUENCE]
        ]
        pred_labels = np.array(pred_labels,
                               dtype=int).squeeze(-1).swapaxes(0, 1)
        # get attention if possible
        pred_attns = None

        true_str, _ = engine.get_sent(model, de_tknize, true_labels, 0)
        write("Start: {}".format(true_str))
        prev_code = None
        for b_id in range(pred_labels.shape[0]):
            pred_str, attn = engine.get_sent(model,
                                             de_tknize,
                                             pred_labels,
                                             b_id,
                                             attn=pred_attns)
            code = '-'.join(map(str, all_y_ids[b_id]))
            if prev_code != code:
                write("Predict ({}): {}".format(code[:10], pred_str))
                prev_code = code

        true_str, _ = engine.get_sent(model, de_tknize, true_labels,
                                      true_labels.shape[0] - 1)
        write("End: {}\n".format(true_str))

    logger.info("Generation Done")
Example #3
def generate(model, data_feed, config, evaluator, num_batch=1, dest_f=None):
    model.eval()
    old_batch_size = config.batch_size

    if num_batch is not None:
        config.batch_size = 5

    de_tknize = utils.get_dekenize()
    data_feed.epoch_init(config, shuffle=False, verbose=False)
    config.batch_size = old_batch_size

    evaluator.initialize()
    logger.info("Generation: {} batches".format(data_feed.num_batch
                                                if num_batch is None
                                                else num_batch))
    while True:
        batch = data_feed.next_batch()
        if batch is None or (num_batch is not None
                             and data_feed.ptr > num_batch):
            break
        outputs, labels = model(batch, mode=GEN, gen_type=config.gen_type)
        # move from GPU to CPU
        pred_labels = [
            t.cpu().data.numpy() for t in outputs[DecoderRNN.KEY_SEQUENCE]
        ]
        pred_labels = np.array(pred_labels,
                               dtype=int).squeeze(-1).swapaxes(0, 1)
        true_labels = labels.cpu().data.numpy()
        # get attention if possible
        if config.use_attn:
            pred_attns = [
                t.cpu().data.numpy()
                for t in outputs[DecoderRNN.KEY_ATTN_SCORE]
            ]
            pred_attns = np.array(pred_attns,
                                  dtype=float).squeeze(2).swapaxes(0, 1)
        else:
            pred_attns = None

        ctx = batch.get('contexts')
        ctx_size = ctx.shape[1]
        for b_id in range(pred_labels.shape[0]):
            pred_str, attn = engine.get_sent(model, de_tknize, pred_labels,
                                             b_id, attn=pred_attns)
            ctx_str = []
            for i in range(ctx_size):
                temp, _ = engine.get_sent(model, de_tknize, ctx[:, i, 1:], b_id)
                if temp:
                    ctx_str.append(temp)
            ctx_str = '<t>'.join(ctx_str)
            true_str, _ = engine.get_sent(model, de_tknize, true_labels, b_id)
            evaluator.add_example(true_str, pred_str)
            if dest_f is None:
                logger.info("Source: {}".format(ctx_str))
                logger.info("Target: {}".format(true_str))
                logger.info("Predict: {}\n".format(pred_str))
            else:
                dest_f.write("Source: {}\n".format(ctx_str))
                dest_f.write("Target: {}\n".format(true_str))
                dest_f.write("Predict: {}\n\n".format(pred_str))
    if dest_f is None:
        logger.info(evaluator.get_report(include_error=dest_f is not None))
    else:
        dest_f.write(evaluator.get_report(include_error=dest_f is not None))
    logger.info("Generation Done")
Example #4
def selective_generate(model, data_feed, config, selected_clusters):
    model.eval()
    de_tknize = utils.get_dekenize()
    data_feed.epoch_init(config, shuffle=False, verbose=False)
    # collect all selected cluster codes
    codes = {d['code'] for d in selected_clusters}

    logger.info("Generation: {} batches".format(data_feed.num_batch))
    data = []
    total_cnt = 0.0
    in_cnt = 0.0

    while True:
        batch = data_feed.next_batch()
        if batch is None:
            break
        outputs, labels = model(batch, mode=GEN, gen_type=config.gen_type)
        # move from GPU to CPU
        pred_labels = [
            t.cpu().data.numpy() for t in outputs[DecoderRNN.KEY_SEQUENCE]
        ]
        pred_labels = np.array(pred_labels,
                               dtype=int).squeeze(-1).swapaxes(0, 1)
        true_labels = labels.cpu().data.numpy()
        y_ids = outputs[DecoderRNN.KEY_LATENT].cpu().data.numpy()
        y_ids = y_ids.reshape(-1, config.y_size)
        ctx = batch.get('contexts')
        ctx_size = ctx.shape[1]

        for b_id in range(pred_labels.shape[0]):
            code = '-'.join(map(str, y_ids[b_id]))
            total_cnt += 1
            if code in codes:
                pred_str, attn = engine.get_sent(model,
                                                 de_tknize,
                                                 pred_labels,
                                                 b_id,
                                                 attn=None)
                ctx_str = []
                for i in range(ctx_size):
                    temp, _ = engine.get_sent(model, de_tknize, ctx[:, i, 1:],
                                              b_id)
                    ctx_str.append(temp)
                ctx_str = '<t>'.join(ctx_str)
                true_str, _ = engine.get_sent(model, de_tknize, true_labels,
                                              b_id)
                in_cnt += 1
                data.append({
                    'context': ctx_str,
                    'target': true_str,
                    'predict': pred_str,
                    'code': code
                })

    logger.info("In rate {}".format(in_cnt / total_cnt))
    return data
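The filter above keeps only responses whose latent code string appears in selected_clusters; a toy version of the membership test and the reported in rate:

selected_clusters = [{"code": "0-1"}, {"code": "2-3"}]  # toy clusters
codes = {d["code"] for d in selected_clusters}
seen = ["0-1", "1-1", "2-3", "0-1"]
kept = [c for c in seen if c in codes]
print(len(kept) / len(seen))  # in rate: 0.75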
Example #5
def generate(model, data_feed, config, evaluator, num_batch=1, dest_f=None):
    model.eval()
    old_batch_size = config.batch_size

    if num_batch is not None:
        config.batch_size = 3

    de_tknize = utils.get_dekenize()
    data_feed.epoch_init(config, shuffle=False, verbose=False)
    config.batch_size = old_batch_size

    evaluator.initialize()
    logger.info("Generation: {} batches".format(
        data_feed.num_batch if num_batch is None else num_batch))
    while True:
        batch = data_feed.next_batch()
        if batch is None or (num_batch is not None
                             and data_feed.ptr > num_batch):
            break
        outputs, labels = model(batch, mode=GEN, gen_type=config.gen_type)
        # move from GPU to CPU
        labels = labels.cpu()
        pred_labels = [
            t.cpu().data.numpy() for t in outputs[DecoderRNN.KEY_SEQUENCE]
        ]
        pred_labels = np.array(pred_labels,
                               dtype=int).squeeze(-1).swapaxes(0, 1)
        true_labels = labels.data.numpy()
        # get attention if possible
        pred_attns = None

        for b_id in range(pred_labels.shape[0]):
            pred_str, attn = engine.get_sent(model,
                                             de_tknize,
                                             pred_labels,
                                             b_id,
                                             attn=pred_attns)
            true_str, _ = engine.get_sent(model, de_tknize, true_labels, b_id)
            evaluator.add_example(true_str, pred_str)
            if dest_f is None:
                logger.info("Target: {}".format(true_str))
                logger.info("Predict: {}\n".format(pred_str))
            else:
                # dest_f.write("Target: {}\n".format(true_str))
                # dest_f.write("Predict: {}\n\n".format(pred_str))
                dest_f.write("Target: {}\n".format(true_str).encode())
                dest_f.write("Predict: {}\n\n".format(pred_str).encode())

    if dest_f is None:
        logger.info(evaluator.get_report(include_error=dest_f is not None))
    else:
        # dest_f.write(evaluator.get_report(include_error=dest_f is not None))
        dest_f.write(
            evaluator.get_report(include_error=dest_f is not None).encode())
    logger.info("Generation Done")
Example #6
def gen_with_vae(model, data_feed, config, num_batch=1, dest_f=None):
    model.eval()
    old_batch_size = config.batch_size
    if num_batch is not None:
        config.batch_size = 3

    de_tknize = utils.get_dekenize()
    data_feed.epoch_init(config, shuffle=False, verbose=False)

    logger.info("Generation: {} batches".format(
        data_feed.num_batch if num_batch is None else num_batch))
    print_cnt = 0
    sample_n = 5

    def write(msg):
        if dest_f is None:
            logger.info(msg)
        else:
            dest_f.write(msg + '\n')

    while True:
        batch = data_feed.next_batch()
        if batch is None or (num_batch is not None
                             and data_feed.ptr > num_batch):
            break

        ctx = batch.get('contexts')
        ctx_size = ctx.shape[1]
        sample_outputs, _ = model(batch,
                                  mode=GEN,
                                  gen_type="sample",
                                  sample_n=sample_n)
        greedy_outputs, labels = model(batch,
                                       mode=GEN,
                                       gen_type="greedy",
                                       sample_n=sample_n)

        # move from GPU to CPU
        labels = labels.cpu()
        sample_labels = [
            t.cpu().data.numpy()
            for t in sample_outputs[DecoderRNN.KEY_SEQUENCE]
        ]
        greedy_labels = [
            t.cpu().data.numpy()
            for t in greedy_outputs[DecoderRNN.KEY_SEQUENCE]
        ]

        sample_labels = np.array(sample_labels,
                                 dtype=int).squeeze(-1).swapaxes(0, 1)
        greedy_labels = np.array(greedy_labels,
                                 dtype=int).squeeze(-1).swapaxes(0, 1)
        true_labels = labels.data.numpy()

        for b_id in range(true_labels.shape[0]):
            ctx_str = []
            for i in range(ctx_size):
                temp, _ = engine.get_sent(model, de_tknize, ctx[:, i, :], b_id)
                if temp:
                    ctx_str.append(temp)
            ctx_str = '<t>'.join(ctx_str)

            true_str, _ = engine.get_sent(model, de_tknize, true_labels, b_id)
            print_cnt += 1
            write("Source: {}".format(ctx_str))
            write("Target: {}".format(true_str))
            for n_id in range(sample_n):
                pred_str, attn = engine.get_sent(
                    model, de_tknize, greedy_labels,
                    b_id + config.batch_size * n_id)
                write("Sample Z: {}".format(pred_str))
            for n_id in range(sample_n):
                pred_str, attn = engine.get_sent(
                    model, de_tknize, sample_labels,
                    b_id + config.batch_size * n_id)
                write("Sample W: {}".format(pred_str))
            write('\n')
    config.batch_size = old_batch_size

    logger.info("Generation Done\n")
Example #7
def find_mi(model, data_feed, config):
    model.eval()
    de_tknize = utils.get_dekenize()
    data_feed.epoch_init(config, verbose=False, shuffle=False)
    logger.info("Find MI for: {} batches".format(data_feed.num_batch))

    all_codes = []
    all_metas = []
    meta_keys = set()
    def write(msg):
        logger.info(msg)

    def code2id(code, base):
        idx = 0
        for c_id, c in enumerate(code):
            idx += int(c) * np.power(base, c_id)
        return idx

    while True:
        batch = data_feed.next_batch()
        if batch is None:
            break
        results = model(batch, mode=TEACH_FORCE, return_latent=True)

        labels = batch.outputs
        metas = batch.metas
        for key in metas[0].keys():
            meta_keys.add(key)
        log_qy = results.log_qy.view(-1, config.y_size, config.k)
        qy = torch.exp(log_qy)
        qy = qy.cpu().data.numpy()
        y_ids = results.y_ids.cpu().data.numpy()
        for b_id in range(labels.shape[0]):
            true_str, _ = engine.get_sent(model, de_tknize, labels, b_id)
            code = []
            for y_id in range(config.y_size):
                for k_id in range(config.k):
                    if qy[b_id, y_id, k_id] == np.max(qy[b_id, y_id]):
                        code.append(str(k_id))
                        break
            #all_codes.append(code)
            all_codes.append(y_ids[b_id])
            all_metas.append(metas[b_id])

    vec_codes = np.array(all_codes)  # shape: (num_samples, y_size)
    vec_idxes = [code2id(c, config.k) for c in vec_codes]
    vec_vocabs = list(set(vec_idxes))
    vec_idxes = [vec_vocabs.index(v) for v in vec_idxes]

    for key in meta_keys:
        # get all meta about this key
        meta_vals = []
        for m in all_metas:
            if type(m[key]) is list:
                meta_vals.append(" ".join(map(str, m[key])))
            elif type(m[key]) is dict:
                break  # dict-valued metas are unsupported; stop collecting
            else:
                meta_vals.append(m[key])
        if not meta_vals:
            continue
        meta_vocab = list(set(meta_vals))
        meta_vals = [meta_vocab.index(v) for v in meta_vals]

        mi = metrics.homogeneity_score(meta_vals, vec_idxes)
        write("{} mi with ID is {}".format(key, mi))

        # individual dimension
        for y_id in range(config.y_size):
            mi = metrics.homogeneity_score(meta_vals, vec_codes[:, y_id])
            write("{} mi with dim {} is {}".format(key, y_id, mi))
Example #8
def latent_cluster(model, data_feed, config, cluster_name_id=None,
                   action_count=0, num_batch=1, max_samples=5):
    if np.power(config.k, config.y_size) > 2000:
        logger.info("Skip latent cluster too many states")
        return
    model.eval()
    de_tknize = utils.get_dekenize()
    data_feed.epoch_init(config, verbose=False, shuffle=False)
    logger.info("Find cluster for: {} batches".format(data_feed.num_batch
                                                if num_batch is None
                                                else num_batch))
    all_clusters = defaultdict(list)
    cond_y_matrix = np.zeros((config.k, config.k))
    index_cluster_id = defaultdict(list)

    def write(msg):
        logger.info(msg)

    while True:
        batch = data_feed.next_batch()
        if batch is None or (num_batch is not None
                             and data_feed.ptr > num_batch):
            break
        results = model(batch, mode=TEACH_FORCE, return_latent=True)

        labels = batch.outputs
        metas = batch.metas
        raw_index = batch.index
        log_qy = results.log_qy.view(-1, config.y_size, config.k)
        qy = torch.exp(log_qy)
        qy = qy.cpu().data.numpy()

        y_ids = results.y_ids.cpu().data.numpy()
        for b_id in range(labels.shape[0]):
            true_str, _ = engine.get_sent(model, de_tknize, labels, b_id)
            cond_y_matrix[y_ids[b_id]] += 1
            code = []
            for y_id in range(config.y_size):
                for k_id in range(config.k):
                    if qy[b_id, y_id, k_id] == np.max(qy[b_id, y_id]):
                        code.append(str(k_id))
                        break
            code = '-'.join(code)
            index_cluster_id[str(raw_index[b_id])] = code
            all_clusters[code].append((true_str, metas[b_id]))

    # show clusters
    keys = all_clusters.keys()
    keys = sorted(keys)
    logger.info("Find {} clusters".format(len(keys)))

    selected_clusters = []
    if cluster_name_id is None:
        cluster_name_id = defaultdict(int)
    for symbol in keys:
        sents = all_clusters[symbol]
        if len(sents) < 1:
            write("Skip tiny cluster with {} utts - {}".format(len(sents), symbol))
            continue
        if symbol not in cluster_name_id:
            cluster_name_id[symbol] = action_count
            action_count += 1

        write("Symbol {}".format(symbol))
        if len(sents) < max_samples:
            print("Find small cluster with {} utts".format(len(sents)))
            subset_ids = list(range(len(sents)))
            np.random.shuffle(subset_ids)
        else:
            subset_ids = np.random.choice(range(len(sents)), max_samples, replace=False)
        for s_id in subset_ids[0:5]:
            write(sents[s_id][0])
        write("")
        selected_clusters.append({'code': symbol, 'meaning': '',
                                  'examples': [sents[idx][0] for idx in subset_ids]})
    logger.info("Find {} actions".format(action_count))
    for sent in index_cluster_id.keys():
        cluster_name = index_cluster_id[sent]
        if cluster_name in cluster_name_id:
            index_cluster_id[sent] = [cluster_name,
                                      cluster_name_id[cluster_name]]

    return selected_clusters, index_cluster_id, cluster_name_id, action_count
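The inner k_id loop above is an argmax per latent dimension (breaking at the first maximum, exactly as np.argmax does); an equivalent vectorized sketch with toy probabilities:

import numpy as np

qy = np.random.dirichlet(np.ones(4), size=3)  # toy q(y): y_size=3, k=4
code = '-'.join(str(k) for k in qy.argmax(axis=-1))
print(code)  # e.g. "2-0-3"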
Example #9
def generate(model, data_feed, config, evaluator, num_batch=1, dest_f=None):
    model.eval()
    old_batch_size = config.batch_size

    if num_batch is not None:
        config.batch_size = 5

    de_tknize = utils.get_dekenize()
    data_feed.epoch_init(config, shuffle=False, verbose=False)
    config.batch_size = old_batch_size

    evaluator.initialize()
    logger.info("Generation: {} batches".format(
        data_feed.num_batch if num_batch is None else num_batch))

    def write(msg):
        if dest_f is None:
            logger.info(msg)
        else:
            # dest_f.write(msg+'\n')
            dest_f.write(str(msg + '\n').encode())

    while True:
        batch = data_feed.next_batch()
        if batch is None or (num_batch is not None
                             and data_feed.ptr > num_batch):
            break
        outputs, labels = model(batch, mode=GEN, gen_type=config.gen_type)
        prev_outputs, next_outputs = outputs
        prev_labels, next_labels = labels
        cur_labels = batch.get('outputs')

        prev_labels = prev_labels.cpu().data.numpy()
        next_labels = next_labels.cpu().data.numpy()

        prev_pred = [
            t.cpu().data.numpy() for t in prev_outputs[DecoderRNN.KEY_SEQUENCE]
        ]
        prev_pred = np.array(prev_pred, dtype=int).squeeze(-1).swapaxes(0, 1)

        next_pred = [
            t.cpu().data.numpy() for t in next_outputs[DecoderRNN.KEY_SEQUENCE]
        ]
        next_pred = np.array(next_pred, dtype=int).squeeze(-1).swapaxes(0, 1)

        for b_id in range(cur_labels.shape[0]):
            ctx_str, _ = engine.get_sent(model, de_tknize, cur_labels, b_id)
            prev_true_str, _ = engine.get_sent(model, de_tknize, prev_labels,
                                               b_id)
            next_true_str, _ = engine.get_sent(model, de_tknize, next_labels,
                                               b_id)

            pred_prev_str, _ = engine.get_sent(model, de_tknize, prev_pred,
                                               b_id)
            pred_next_str, _ = engine.get_sent(model, de_tknize, next_pred,
                                               b_id)

            evaluator.add_example(prev_true_str, pred_prev_str)
            evaluator.add_example(next_true_str, pred_next_str)

            write("Response: {}".format(ctx_str))
            write("Prev Target: {}".format(prev_true_str))
            write("Prev Predict: {}".format(pred_prev_str))
            write("Next Target: {}".format(next_true_str))
            write("Next Predict: {}\n".format(pred_next_str))

    if dest_f is None:
        logger.info(evaluator.get_report(include_error=dest_f is not None))
    else:
        # dest_f.write(evaluator.get_report(include_error=dest_f is not None))
        dest_f.write(
            evaluator.get_report(include_error=dest_f is not None).encode())
    logger.info("Generation Done")
Example #10
def generate(model, data_feed, config, evaluator, num_batch=1, dest_f=None):
    model.eval()
    de_tknize = get_dekenize()

    def write(msg):
        if msg is None or msg == '':
            return
        if dest_f is None:
            logger.info(msg)
        else:
            dest_f.write(msg + '\n')

    data_feed.epoch_init(config, shuffle=num_batch is not None, verbose=False)
    evaluator.initialize()
    logger.info("Generation: {} batches".format(data_feed.num_batch
                                                if num_batch is None
                                                else num_batch))
    while True:
        batch = data_feed.next_batch()
        if batch is None or (num_batch is not None
                             and data_feed.ptr > num_batch):
            break
        outputs, labels = model(batch, mode=GEN, gen_type=config.gen_type)

        # move from GPU to CPU
        labels = labels.cpu()
        pred_labels = [t.cpu().data.numpy() for t in
                       outputs[DecoderRNN.KEY_SEQUENCE]]
        pred_labels = np.array(pred_labels,
                               dtype=int).squeeze(-1).swapaxes(0, 1)
        true_labels = labels.data.numpy()
        # get attention if possible
        if config.use_attn or config.use_ptr:
            pred_attns = [t.cpu().data.numpy() for t in outputs[DecoderRNN.KEY_ATTN_SCORE]]
            pred_attns = np.array(pred_attns,
                                  dtype=float).squeeze(2).swapaxes(0, 1)
        else:
            pred_attns = None

        # get last 1 context
        ctx = batch.get('contexts')
        ctx_len = batch.get('context_lens')
        domains = batch.domains

        # logger.info the batch in String.
        for b_id in range(pred_labels.shape[0]):
            pred_str, attn = get_sent(model, de_tknize, pred_labels, b_id, attn=pred_attns)
            true_str, _ = get_sent(model, de_tknize, true_labels, b_id)
            prev_ctx = ""
            if ctx is not None:
                ctx_str, _ = get_sent(model, de_tknize, ctx[:, ctx_len[b_id]-1, :], b_id)
                prev_ctx = "Source: {}".format(ctx_str)

            domain = domains[b_id]
            evaluator.add_example(true_str, pred_str, domain)
            if num_batch is None or num_batch <= 2:
                write(prev_ctx)
                write("{}:: True: {} ||| Pred: {}".format(domain, true_str, pred_str))
                if attn:
                    write("[[{}]]".format(attn))

    write(evaluator.get_report(include_error=dest_f is not None))
    logger.info("Generation Done")