def rouge_l(eval_sentences, eval_len, ref_sentences, ref_len, vocab,
            use_bpe=False):
  """Rouge-L.

  Args:
    eval_sentences: predictions to be evaluated.
    eval_len: lengths of the predictions.
    ref_sentences: reference sentences.
    ref_len: lengths of the references.
    vocab: vocabulary.
    use_bpe: whether to use BPE.

  Returns:
    Rouge-L F1, averaged over the batch.
  """
  f1_scores = []
  for e, el, r, rl in zip(eval_sentences, eval_len, ref_sentences, ref_len):
    e = id2text(e[:el], vocab=vocab, use_bpe=use_bpe).split()
    r = r[:rl]
    # Drop special tokens from both sides, matching rouge_n below.
    e = [x for x in e if x not in SPECIAL_TOKENS]
    r = [x for x in r if x not in SPECIAL_TOKENS]
    lcs = _len_lcs(e, r)
    f1_scores.append(_f_lcs(lcs, len(r), len(e)))
  return np.mean(f1_scores, dtype=np.float32)
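# _len_lcs and _f_lcs are referenced above but not shown in this file. Below
# is a minimal sketch following the standard ROUGE-L formulation (Lin, 2004):
# F_lcs = (1 + beta^2) * R * P / (R + beta^2 * P), with R = LCS/m (m = reference
# length) and P = LCS/n (n = prediction length). The dynamic program and the
# _safe_divide guards are assumptions, not necessarily the repo's exact code.
def _len_lcs(x, y):
  """Length of the longest common subsequence of token lists x and y."""
  n, m = len(x), len(y)
  table = {}
  for i in range(n + 1):
    for j in range(m + 1):
      if i == 0 or j == 0:
        table[i, j] = 0
      elif x[i - 1] == y[j - 1]:
        table[i, j] = table[i - 1, j - 1] + 1
      else:
        table[i, j] = max(table[i - 1, j], table[i, j - 1])
  return table[n, m]


def _f_lcs(llcs, m, n):
  """ROUGE-L F-measure from LCS length, reference length m, prediction length n."""
  r_lcs = _safe_divide(llcs, m)  # Recall.
  p_lcs = _safe_divide(llcs, n)  # Precision.
  beta = _safe_divide(p_lcs, r_lcs)
  num = (1 + beta**2) * r_lcs * p_lcs
  denom = r_lcs + beta**2 * p_lcs
  return _safe_divide(num, denom)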
def rouge_n(eval_sentences, eval_len, ref_sentences, ref_len, n, vocab,
            use_bpe=False, predict_mode=False):
  """Rouge-N."""
  f1_scores = []
  for e, el, r, rl in zip(eval_sentences, eval_len, ref_sentences, ref_len):
    e = id2text(e[:el], vocab=vocab, use_bpe=use_bpe).split()
    r = r[:rl]
    e = [x for x in e if x not in SPECIAL_TOKENS]
    r = [x for x in r if x not in SPECIAL_TOKENS]
    if n == 1 and predict_mode:
      tf.logging.info("prediction: %s", " ".join(e))
      tf.logging.info("reference: %s", " ".join(r))
    eval_ngrams = _get_ngrams(n, e)
    ref_ngrams = _get_ngrams(n, r)
    ref_count = len(ref_ngrams)
    eval_count = len(eval_ngrams)
    overlapping_ngrams = eval_ngrams.intersection(ref_ngrams)
    overlapping_count = len(overlapping_ngrams)
    precision = _safe_divide(overlapping_count, eval_count)
    recall = _safe_divide(overlapping_count, ref_count)
    f1_scores.append(
        _safe_divide(2 * precision * recall, precision + recall))
  return np.mean(f1_scores, dtype=np.float32)
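# _get_ngrams and _safe_divide are likewise assumed helpers. One plausible
# sketch: unique n-grams collected into a set of tuples (so a repeated n-gram
# counts once, consistent with the set intersection above), and a division
# that returns 0.0 on an empty prediction or reference.
def _get_ngrams(n, tokens):
  """Returns the set of n-grams (as tuples) in the token list."""
  return set(tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1))


def _safe_divide(x, y):
  """Returns x / y as a float, or 0.0 when the denominator is zero."""
  return float(x) / y if y else 0.0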
def _do_print(n, sequence, lengths, to_txt):
  """Logs the first element of `sequence` under the tag `n`.

  Note: `vocab`, `use_bpe`, and `predict_mode` are free variables here, so
  this only works as a nested helper inside a scope that binds them.
  """
  if to_txt:
    s = sequence[0][:lengths[0]]
    output = id2text(s, vocab, use_bpe=use_bpe)
  else:
    output = " ".join(sequence[0])
  if not predict_mode:
    tf.logging.info("%s: %s", n, output)
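# A sketch of the closure pattern _do_print relies on. The host function name
# `log_batch` and its arguments are assumptions for illustration, not the
# repo's API; the original presumably nests _do_print in a similar scope.
def log_batch(predictions, pred_lengths, references, ref_lengths,
              vocab, use_bpe=False, predict_mode=False):
  """Hypothetical host scope that binds vocab/use_bpe/predict_mode."""
  def _do_print_nested(n, sequence, lengths, to_txt):
    if to_txt:
      s = sequence[0][:lengths[0]]
      output = id2text(s, vocab, use_bpe=use_bpe)
    else:
      output = " ".join(sequence[0])
    if not predict_mode:
      tf.logging.info("%s: %s", n, output)

  _do_print_nested("prediction", predictions, pred_lengths, to_txt=True)
  _do_print_nested("reference", references, ref_lengths, to_txt=False)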
def remove_repetitive_trigram(preds, lengths, vocab, hps):
  """Select from the beam a prediction without repetitive trigrams."""
  ret_preds, ret_lengths = [], []
  for (pred, length) in zip(preds, lengths):
    is_bad = True
    # Scan the beam in order and keep the first hypothesis with no repeated
    # trigram and no bad token.
    for i in range(hps.beam_width):
      l = length[..., i]
      p = pred[..., i][:l]
      tokens = data.id2text(p, vocab=vocab, use_bpe=hps.use_bpe).split()
      is_bad = repetitive_ngrams(3, tokens) or bad_tok(tokens, vocab)
      if not is_bad:
        ret_preds.append(pred[..., i])
        ret_lengths.append(length[..., i])
        break
    # Fall back to the top beam if every hypothesis was flagged.
    if is_bad:
      ret_preds.append(pred[..., 0])
      ret_lengths.append(length[..., 0])
  return np.int32(np.stack(ret_preds)), np.int32(np.stack(ret_lengths))
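# repetitive_ngrams and bad_tok are referenced above but not defined in this
# file. Plausible sketches consistent with their call sites follow; the
# bad_tok criterion (flagging special tokens in the decoded output) is an
# assumption about what counts as a bad token.
def repetitive_ngrams(n, tokens):
  """True if any n-gram occurs more than once in the token list."""
  seen = set()
  for i in range(len(tokens) - n + 1):
    ngram = tuple(tokens[i:i + n])
    if ngram in seen:
      return True
    seen.add(ngram)
  return False


def bad_tok(tokens, vocab):
  """True if the decoded tokens contain an unwanted token (assumed check)."""
  del vocab  # Unused in this sketch.
  return any(t in SPECIAL_TOKENS for t in tokens)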
def main(_):
  tf.logging.set_verbosity(tf.logging.INFO)
  if FLAGS.model == "seq2seq":
    assert FLAGS.rnn_cell == "lstm"
    assert FLAGS.att_type != "hyper"
  if FLAGS.model == "hypernet" and FLAGS.rank != FLAGS.decoder_dim:
    print("WARNING: recommended rank value: decoder_dim.")
  if FLAGS.att_neighbor:
    assert FLAGS.neighbor_dim == FLAGS.encoder_dim or FLAGS.att_type == "my"
  if FLAGS.use_copy or FLAGS.att_neighbor:
    assert FLAGS.att_type == "my"
  # These numbers are the target vocabulary sizes of the datasets.
  # This allows for using different vocabularies for sources and targets,
  # following the implementation in Open-NMT.
  # I will later put these into command line arguments.
  if FLAGS.use_bpe:
    if FLAGS.dataset == "nyt":
      output_size = 10013
    elif FLAGS.dataset == "giga":
      output_size = 24654
    elif FLAGS.dataset == "cnnd":
      output_size = 10232
    else:
      raise ValueError("Unknown dataset: %s" % FLAGS.dataset)
  else:
    if FLAGS.dataset == "nyt":
      output_size = 68885
    elif FLAGS.dataset == "giga":
      output_size = 107389
    elif FLAGS.dataset == "cnnd":
      output_size = 21000
    else:
      raise ValueError("Unknown dataset: %s" % FLAGS.dataset)
  vocab = data.Vocab(FLAGS.vocab_path, FLAGS.vocab_size, FLAGS.dataset)
  hps = tf.contrib.training.HParams(
      sample_neighbor=FLAGS.sample_neighbor,
      use_cluster=FLAGS.use_cluster,
      binary_neighbor=FLAGS.binary_neighbor,
      att_neighbor=FLAGS.att_neighbor,
      encode_neighbor=FLAGS.encode_neighbor,
      sum_neighbor=FLAGS.sum_neighbor,
      dataset=FLAGS.dataset,
      rnn_cell=FLAGS.rnn_cell,
      output_size=output_size + vocab.offset,
      train_path=FLAGS.train_path,
      dev_path=FLAGS.dev_path,
      tie_embedding=FLAGS.tie_embedding,
      use_bpe=FLAGS.use_bpe,
      use_copy=FLAGS.use_copy,
      reuse_attention=FLAGS.reuse_attention,
      use_bridge=FLAGS.use_bridge,
      use_residual=FLAGS.use_residual,
      att_type=FLAGS.att_type,
      random_neighbor=FLAGS.random_neighbor,
      num_neighbors=FLAGS.num_neighbors,
      model=FLAGS.model,
      trainer=FLAGS.trainer,
      learning_rate=FLAGS.learning_rate,
      lr_schedule=FLAGS.lr_schedule,
      total_steps=FLAGS.total_steps,
      emb_dim=FLAGS.emb_dim,
      binary_dim=FLAGS.binary_dim,
      neighbor_dim=FLAGS.neighbor_dim,
      drop=FLAGS.drop,
      emb_drop=FLAGS.emb_drop,
      out_drop=FLAGS.out_drop,
      encoder_drop=FLAGS.encoder_drop,
      decoder_drop=FLAGS.decoder_drop,
      weight_decay=FLAGS.weight_decay,
      encoder_dim=FLAGS.encoder_dim,
      num_encoder_layers=FLAGS.num_encoder_layers,
      decoder_dim=FLAGS.decoder_dim,
      num_decoder_layers=FLAGS.num_decoder_layers,
      num_mlp_layers=FLAGS.num_mlp_layers,
      rank=FLAGS.rank,
      sigma_norm=FLAGS.sigma_norm,
      batch_size=FLAGS.batch_size,
      sampling_probability=FLAGS.sampling_probability,
      beam_width=FLAGS.beam_width,
      max_enc_steps=FLAGS.max_enc_steps,
      max_dec_steps=FLAGS.max_dec_steps,
      vocab_size=FLAGS.vocab_size,
      max_grad_norm=FLAGS.max_grad_norm,
      length_norm=FLAGS.length_norm,
      cp=FLAGS.coverage_penalty,
      predict_mode=FLAGS.predict_mode)
  run_config = tf.estimator.RunConfig(model_dir=FLAGS.model_dir)
  eval_input_fn = partial(
      data.input_function, is_train=False, vocab=vocab, hps=hps)
  estimator = tf.estimator.Estimator(
      model_fn=partial(model_function.model_function, vocab=vocab, hps=hps),
      config=run_config,
      model_dir=run_config.model_dir)
  results = estimator.predict(input_fn=eval_input_fn)
  with tf.gfile.Open("%s/prediction" % FLAGS.model_dir, "w") as fout:
    for result in results:
      outputs, _ = result["outputs"], result["lengths"]
      prediction = data.id2text(outputs, vocab, use_bpe=FLAGS.use_bpe)
      fout.write(prediction + "\n")
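# Standard TF1 entry point, assuming this script is run directly; tf.app.run
# parses the flags and invokes main.
if __name__ == "__main__":
  tf.app.run()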