import os
import pickle
from math import log
from time import time

import numpy as np
from tqdm import tqdm

from base_models import TGEN_Model


def run_nucleus_sampling(beam_search_model: TGEN_Model, das, cfg, max_pred_len=60):
    """Generate one text per DA with nucleus (top-p) sampling instead of beam search."""
    da_embedder = beam_search_model.da_embedder
    text_embedder = beam_search_model.text_embedder

    results = []

    start = time()
    print("Start generating")
    for da_emb in tqdm(da_embedder.get_embeddings(das)):
        # Encode the DA once; the encoder returns [outputs, *final states].
        inf_enc_out = beam_search_model.encoder_model.predict(np.array([da_emb]))
        enc_outs = inf_enc_out[0]
        enc_last_state = inf_enc_out[1:]
        # A single path: (cumulative log-prob, token sequence, decoder state).
        paths = [(log(1.0), text_embedder.start_emb, enc_last_state)]

        end_tokens = text_embedder.end_embs
        for _ in range(max_pred_len):
            # Expand the single path by sampling one token from the top-p
            # nucleus (the method-name spelling matches the TGEN_Model API).
            paths, _ = beam_search_model.beam_search_exapand(
                paths, enc_outs, 1, beam_search=False, top_p=cfg['top_p'])
            # Stop once every path has produced an end token.
            if all(p[1][-1] in end_tokens for p in paths):
                break

        # With a width of 1 there is exactly one sampled path; decode it.
        best_path = paths[0]
        pred_toks = text_embedder.reverse_embedding(best_path[1])
        results.append(pred_toks)
    print("*** Time to generate text =", time() - start)

    return results
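
# A minimal usage sketch for run_nucleus_sampling. The config path and top_p
# value below are illustrative assumptions; the setup otherwise mirrors the
# examples later in this file.
def _nucleus_sampling_demo():
    from embedding_extractor import TokEmbeddingSeq2SeqExtractor, DAEmbeddingSeq2SeqExtractor
    from utils import get_training_variables, get_test_das

    texts, das = get_training_variables()
    text_embedder = TokEmbeddingSeq2SeqExtractor(texts)
    da_embedder = DAEmbeddingSeq2SeqExtractor(das)

    model = TGEN_Model(da_embedder, text_embedder,
                       'new_configs/model_configs/seq2seq_all_data.yaml')
    model.load_models()
    preds = run_nucleus_sampling(model, get_test_das(), {'top_p': 0.9})
    print(' '.join(preds[0]))
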
def get_scores_ordered_beam(cfg,
                            da_embedder,
                            text_embedder,
                            beam_save_path=None):
    print("Loading Training Data")
    beam_size = cfg["beam_size"]
    train_texts, train_das = get_multi_reference_training_variables()
    if beam_save_path is None:
        beam_save_path = TRAIN_BEAM_SAVE_FORMAT.format(
            beam_size, cfg["tgen_seq2seq_config"].split('.')[0].split('/')[-1])
    if not os.path.exists(beam_save_path):
        models = TGEN_Model(da_embedder, text_embedder,
                            cfg["tgen_seq2seq_config"])
        models.load_models()
        print("Creating test final beams")
        scorer = get_score_function('identity', cfg, models, None, beam_size)
        run_beam_search_with_rescorer(scorer,
                                      models,
                                      train_das,
                                      beam_size,
                                      cfg,
                                      only_rerank_final=True,
                                      save_final_beam_path=beam_save_path)
    bleu = BLEUScore()
    with open(beam_save_path, "rb") as f:
        final_beam = pickle.load(f)
    text_seqs = []
    da_seqs = []
    scores = []
    log_probs = []
    with_ref_train_flag = cfg["with_refs_train"]
    num_ranks = cfg["num_ranks"]
    cut_offs = get_section_cutoffs(num_ranks)
    regression_vals = get_regression_vals(num_ranks, with_ref_train_flag)
    if cfg["output_type"] != 'pair':
        print("Cut off values:", cut_offs)
        print("Regression vals:", regression_vals)

    only_top = cfg.get("only_top", False)
    only_bottom = cfg.get("only_bottom", False)
    merge_middles = cfg["merge_middle_sections"]
    if only_top:
        print("Only using top value")
    if merge_middles and only_top:
        print("Ignoring only_top since merge_middle_sections is set")
    training_vals = list(zip(final_beam, train_texts, train_das))
    training_vals = training_vals[:cfg.get("use_size", len(training_vals))]
    for beam, real_texts, da in tqdm(training_vals):
        beam_scores = []
        if with_ref_train_flag:
            # Reference texts have no beam log-probs, so they get placeholder
            # scores of 0 (note that nothing is appended to log_probs here, so
            # the arrays only stay aligned when with_refs_train is off).
            text_seqs.extend(real_texts)
            da_seqs.extend([da for _ in real_texts])
            scores.extend([0 for _ in real_texts])

        for i, path in enumerate(beam):
            bleu.reset()
            hyp = [
                x for x in text_embedder.reverse_embedding(path[1])
                if x not in [START_TOK, END_TOK, PAD_TOK]
            ]
            # Strip start/end tokens from each reference before scoring.
            refs = [[t for t in ref if t not in [START_TOK, END_TOK]]
                    for ref in real_texts]
            bleu.append(hyp, refs)
            beam_scores.append((bleu.score(), hyp, path))

            # log_probs.append(i)

        # Sort by BLEU alone; letting ties fall through to the hyp/path tuple
        # elements can raise a TypeError on the numpy decoder states.
        for i, (score, hyp, path) in enumerate(
                sorted(beam_scores, key=lambda b: b[0], reverse=True)):
            text_seqs.append([START_TOK] + hyp + [END_TOK])
            da_seqs.append(da)
            if cfg["output_type"] in ['bleu', 'pair']:
                scores.append(score)
            elif cfg["output_type"] == 'order_discrete':
                scores.append(to_categorical([i], num_classes=beam_size))
            elif cfg["output_type"] in [
                    'regression_ranker', 'regression_reranker_relative'
            ]:
                scores.append(i / (beam_size - 1))
            elif cfg["output_type"] in [
                    'regression_sections', 'binary_classif'
            ]:
                val = i / (beam_size - 1)
                # get_section_value maps the [0, 1] rank fraction onto the
                # zero-mean [-1, 1] section values.
                regression_val = get_section_value(val, cut_offs,
                                                   regression_vals,
                                                   merge_middles, only_top,
                                                   only_bottom)
                scores.append(regression_val)
            else:
                raise ValueError("Unknown output type")

            log_probs.append([path[0]])

    text_seqs = np.array(
        text_embedder.get_embeddings(text_seqs, pad_from_end=False))
    da_seqs = np.array(da_embedder.get_embeddings(da_seqs))

    if cfg["output_type"] in [
            'regression_ranker', 'bleu', 'regression_reranker_relative',
            'pair', 'regression_sections', 'binary_classif'
    ]:
        # print("SCORES: ", Counter(scores))
        scores = np.array(scores).reshape((-1, 1))
    elif cfg["output_type"] == 'order_discrete':
        scores = np.array(scores).reshape((-1, beam_size))

    log_probs = np.array(log_probs)
    return text_seqs, da_seqs, scores, log_probs


# The opening of the conditional below was lost in extraction; the condition is
# an assumption inferred from the arguments each branch passes to
# reranker.train (the rank/section arguments are only meaningful for the
# 'regression_sections' and 'binary_classif' output types).
if cfg["output_type"] not in ['regression_sections', 'binary_classif']:
    reranker.train(text_seqs, da_seqs, scores, log_probs, cfg["epoch"],
                   cfg["valid_size"], cfg.get("min_training_passes", 5))
else:
    reranker.train(text_seqs, da_seqs, scores, log_probs, cfg["epoch"],
                   cfg["valid_size"], cfg["num_ranks"],
                   cfg.get("only_bottom", False), cfg.get("only_top", False),
                   cfg.get("min_training_passes", 5))

if cfg["show_reranker_post_training_stats"]:
    test_das = get_test_das()
    test_texts = get_true_sents()
    final_beam_path = TEST_BEAM_SAVE_FORMAT.format(10)

    if not os.path.exists(final_beam_path):
        print("Creating final beams file")
        models = TGEN_Model(da_embedder, text_embedder,
                            cfg['tgen_seq2seq_config'])
        models.load_models()
        scorer = get_score_function('identity', cfg, models, None, 10)
        run_beam_search_with_rescorer(scorer,
                                      models,
                                      test_das,
                                      10,
                                      only_rerank_final=True,
                                      save_final_beam_path=final_beam_path)

    bleu = BLEUScore()
    test_da_embs = da_embedder.get_embeddings(test_das)
    with open(final_beam_path, 'rb') as f:
        final_beam = pickle.load(f)
    all_reals = []
    all_preds = []
    for da_emb, beam, true in zip(test_da_embs, final_beam, test_texts):
        ...  # loop body truncated in the source extract
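
# A hedged reconstruction of the truncated loop above, written as a standalone
# helper for illustration (assumptions: beam[0] is the top-ranked path and
# path[1] holds token embeddings, as in the other examples in this file).
def _post_training_bleu(final_beam, test_texts, text_embedder, bleu):
    all_reals, all_preds = [], []
    for beam, true in zip(final_beam, test_texts):
        pred = text_embedder.reverse_embedding(beam[0][1])
        all_preds.append(
            [t for t in pred if t not in [START_TOK, END_TOK, PAD_TOK]])
        all_reals.append(true)
    bleu.reset()
    for hyp, refs in zip(all_preds, all_reals):
        bleu.append(hyp, refs)
    return bleu.score()
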
# Example No. 4
import pickle

import numpy as np

from base_models import TGEN_Model
from embedding_extractor import TokEmbeddingSeq2SeqExtractor, DAEmbeddingSeq2SeqExtractor
from utils import get_training_variables, get_test_das

texts, das = get_training_variables()
text_embedder = TokEmbeddingSeq2SeqExtractor(texts)
da_embedder = DAEmbeddingSeq2SeqExtractor(das)

das_test = get_test_das()

beam_path = 'output_files/saved_beams/16_vanilla_2_3.pickle'
beam_path_2 = 'output_files/saved_beams/t2p_vanilla_3.pickle'
with open(beam_path, 'rb') as f:
    beams = pickle.load(f)
with open(beam_path_2, 'rb') as f:
    beams_2 = pickle.load(f)
models = TGEN_Model(da_embedder, text_embedder, 'new_configs/model_configs/seq2seq_all_data.yaml')
models.load_models()

da_emb = da_embedder.get_embeddings([das_test[0]])[0]
inf_enc_out = models.encoder_model.predict(np.array([da_emb]))
enc_outs = inf_enc_out[0]
enc_last_state = inf_enc_out[1:]

# Do the two saved beams agree on the top hypothesis?
print(beams[0][0][1] == beams_2[0][0][1])
# Recompute each top hypothesis's probability under the model and print it
# next to the score stored with the beam (path[0]).
print(sum(models.get_prob_sequence(enc_outs, beams[0][0][1], enc_last_state)))
print(beams[0][0][0], ' '.join(text_embedder.reverse_embedding(beams[0][0][1])))
print()
print(sum(models.get_prob_sequence(enc_outs, beams_2[0][0][1], enc_last_state)))
print(beams_2[0][0][0], ' '.join(text_embedder.reverse_embedding(beams_2[0][0][1])))
print("********************************")
# And the runner-up path from the first beam, for comparison.
print(sum(models.get_prob_sequence(enc_outs, beams[0][1][1], enc_last_state)))
import os

import yaml

from base_models import TGEN_Model
from embedding_extractor import TokEmbeddingSeq2SeqExtractor, DAEmbeddingSeq2SeqExtractor
# Assumption: these helpers live in utils, mirroring the imports above.
from utils import CONFIGS_MODEL_DIR, get_training_variables, get_multi_reference_training_variables

# Default to the most recently modified config in CONFIGS_MODEL_DIR.
if cfg_path is None:
    filepaths = [os.path.join(CONFIGS_MODEL_DIR, f) for f in os.listdir(CONFIGS_MODEL_DIR)]
    cfg_path = max(filepaths, key=os.path.getmtime)

cfg = yaml.safe_load(open(cfg_path, 'r'))
texts, das = get_training_variables()
text_embedder = TokEmbeddingSeq2SeqExtractor(texts)
da_embedder = DAEmbeddingSeq2SeqExtractor(das)

texts_mr, da_mr = get_multi_reference_training_variables()
# train_text = np.array(text_embedder.get_embeddings(texts, pad_from_end=True) + [text_embedder.empty_embedding])
# da_embs = da_embedder.get_embeddings(das) + [da_embedder.empty_embedding]

seq2seq = TGEN_Model(da_embedder, text_embedder, cfg_path)
seq2seq.load_models()
seq2seq.full_model.summary()
if "use_prop" in cfg:
    da_mr = da_mr[:int(len(da_mr) * cfg['use_prop'])]
    texts_mr = texts_mr[:int(len(da_mr) * cfg['use_prop'])]
seq2seq.train(da_seq=da_mr,
              text_seq=texts_mr,
              n_epochs=cfg["epoch"],
              valid_size=cfg["valid_size"],
              early_stop_point=cfg["min_epoch"],
              minimum_stop_point=20,
              multi_ref=True)
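
# A hedged post-training sanity check (illustrative only; the helper name is
# hypothetical). It re-scores a reference text under the freshly trained model,
# reusing only calls that appear elsewhere in this file.
import numpy as np

def _post_train_check(seq2seq, da_embedder, text_embedder, da, text_emb):
    da_emb = da_embedder.get_embeddings([da])[0]
    inf_enc_out = seq2seq.encoder_model.predict(np.array([da_emb]))
    enc_outs, enc_last_state = inf_enc_out[0], inf_enc_out[1:]
    probs = seq2seq.get_prob_sequence(enc_outs, text_emb, enc_last_state)
    print(sum(probs), ' '.join(text_embedder.reverse_embedding(text_emb)))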