def _predict(sess, examples: [InputExample]):
    hypotheses, inputs = [], []

    features = []
    for example in examples:
        feature = convert_single_example(
            ex_index=0, example=example,
            max_seq_length=config_data.max_seq_length, tokenizer=tokenizer)
        features.append(feature)

    for feature in features:
        feed_dict = {
            src_input_ids: [feature.src_input_ids],
            src_segment_ids: [feature.src_segment_ids],
            tx.global_mode(): tf.estimator.ModeKeys.PREDICT,
        }
        fetches = {
            'beam_search_ids': beam_search_ids,
            'src_input_ids': src_input_ids
        }
        fetches_ = sess.run(fetches, feed_dict=feed_dict)
        hypotheses.extend(h.tolist() for h in fetches_['beam_search_ids'])
        inputs.extend(h.tolist() for h in fetches_['src_input_ids'])

    hypotheses = utils.list_strip_eos(hypotheses, eos_token_id)

    write_token_id_arrays_to_text_file(
        inputs, os.path.join(model_dir, 'predict-inputs.txt'), tokenizer)
    write_token_id_arrays_to_text_file(
        hypotheses, os.path.join(model_dir, 'predict-predictions.txt'),
        tokenizer)
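# `write_token_id_arrays_to_text_file` is used above but not defined in this
# snippet. A minimal sketch of such a helper is given below, assuming the
# tokenizer exposes `convert_ids_to_tokens` (as BERT-style tokenizers do); the
# actual helper in this repository may detokenize differently.
def _write_token_id_arrays_to_text_file_sketch(id_arrays, path, tokenizer):
    """Detokenizes each array of token ids and writes one example per line."""
    with open(path, 'w', encoding='utf-8') as f:
        for ids in id_arrays:
            tokens = tokenizer.convert_ids_to_tokens(ids)
            # Merge WordPiece continuation pieces ("##...") back into words.
            text = ' '.join(tokens).replace(' ##', '')
            f.write(text + '\n')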
def _eval_epoch(sess, epoch, mode):
    if mode == 'eval':
        eval_data = dev_data
    elif mode == 'test':
        eval_data = test_data
    else:
        raise ValueError('`mode` should be either "eval" or "test".')

    references, hypotheses = [], []
    bsize = config_data.test_batch_size
    for i in range(0, len(eval_data), bsize):
        # print("eval {}/{}".format(i, len(eval_data)))
        sources, targets = zip(*eval_data[i:i + bsize])
        x_block = data_utils.source_pad_concat_convert(sources)
        feed_dict = {
            encoder_input: x_block,
            tx.global_mode(): tf.estimator.ModeKeys.EVAL,
        }
        fetches = {
            'inferred_ids': inferred_ids,
        }
        fetches_ = sess.run(fetches, feed_dict=feed_dict)
        hypotheses.extend(h.tolist() for h in fetches_['inferred_ids'])
        references.extend(r.tolist() for r in targets)

    hypotheses = utils.list_strip_eos(hypotheses, eos_token_id)
    references = utils.list_strip_eos(references, eos_token_id)

    if mode == 'eval':
        # Writes results to files to evaluate BLEU
        # For 'eval' mode, the BLEU is based on token ids (rather than
        # text tokens) and serves only as a surrogate metric to monitor
        # the training process
        fname = os.path.join(FLAGS.model_dir, 'tmp.eval')
        hypotheses = tx.utils.str_join(hypotheses)
        references = tx.utils.str_join(references)
        hyp_fn, ref_fn = tx.utils.write_paired_text(
            hypotheses, references, fname, mode='s')
        eval_bleu = bleu_wrapper(ref_fn, hyp_fn, case_sensitive=True)
        eval_bleu = 100. * eval_bleu
        logger.info('epoch: %d, eval_bleu %.4f', epoch, eval_bleu)
        print('epoch: %d, eval_bleu %.4f' % (epoch, eval_bleu))

        if eval_bleu > best_results['score']:
            logger.info('epoch: %d, best bleu: %.4f', epoch, eval_bleu)
            best_results['score'] = eval_bleu
            best_results['epoch'] = epoch
            model_path = os.path.join(FLAGS.model_dir, 'best-model.ckpt')
            logger.info('saving model to %s', model_path)
            print('saving model to %s' % model_path)
            saver.save(sess, model_path)

    elif mode == 'test':
        # For 'test' mode, together with the commands in README.md, BLEU
        # is evaluated based on text tokens, which is the standard metric.
        fname = os.path.join(FLAGS.model_dir, 'test.output')
        hwords, rwords = [], []
        for hyp, ref in zip(hypotheses, references):
            hwords.append([id2w[y] for y in hyp])
            rwords.append([id2w[y] for y in ref])
        hwords = tx.utils.str_join(hwords)
        rwords = tx.utils.str_join(rwords)
        hyp_fn, ref_fn = tx.utils.write_paired_text(
            hwords, rwords, fname, mode='s')
        logger.info('Test output written to file: %s', hyp_fn)
        print('Test output written to file: %s' % hyp_fn)
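# `utils.list_strip_eos` is not shown in this file. For reference, a sketch of
# the expected behaviour (truncating each hypothesis/reference at the first
# EOS id) could look as follows; the repository's implementation may differ.
def _list_strip_eos_sketch(token_id_lists, eos_id):
    """Truncates each list of token ids at the first occurrence of `eos_id`."""
    stripped = []
    for ids in token_id_lists:
        if eos_id in ids:
            ids = ids[:ids.index(eos_id)]
        stripped.append(ids)
    return stripped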
def _eval_epoch(epoch, mode, print_fn=None):
    if print_fn is None:
        print_fn = print
        tqdm_leave = True
    else:
        tqdm_leave = False

    model.eval()
    eval_data = datasets[mode]
    eval_iter = tx.data.DataIterator(eval_data)
    references, hypotheses = [], []
    for batch in tqdm.tqdm(eval_iter, ncols=80, leave=tqdm_leave,
                           desc=f"Eval on {mode} set"):
        predictions = model(
            encoder_input=batch.source,
            beam_width=beam_width,
        )
        if beam_width == 1:
            decoded_ids = predictions[0].sample_id
        else:
            decoded_ids = predictions["sample_id"][:, :, 0]
        hypotheses.extend(h.tolist() for h in decoded_ids)
        references.extend(r.tolist() for r in batch.target_output)

    hypotheses = utils.list_strip_eos(hypotheses, vocab.eos_token_id)
    references = utils.list_strip_eos(references, vocab.eos_token_id)

    if mode == "valid":
        # Writes results to files to evaluate BLEU
        # For 'eval' mode, the BLEU is based on token ids (rather than
        # text tokens) and serves only as a surrogate metric to monitor
        # the training process
        fname = os.path.join(args.output_dir, "tmp.eval")
        hwords, rwords = [], []
        for hyp, ref in zip(hypotheses, references):
            hwords.append([str(y) for y in hyp])
            rwords.append([str(y) for y in ref])
        hwords = tx.utils.str_join(hwords)
        rwords = tx.utils.str_join(rwords)
        hyp_file, ref_file = tx.utils.write_paired_text(
            hwords, rwords, fname, mode="s",
            src_fname_suffix="hyp", tgt_fname_suffix="ref",
        )
        eval_bleu = tx.evals.file_bleu(ref_file, hyp_file,
                                       case_sensitive=True)
        logger.info("epoch: %d, eval_bleu %.4f", epoch, eval_bleu)
        print_fn(f"epoch: {epoch:d}, eval_bleu {eval_bleu:.4f}")

        if eval_bleu > best_results["score"]:
            logger.info("epoch: %d, best bleu: %.4f", epoch, eval_bleu)
            best_results["score"] = eval_bleu
            best_results["epoch"] = epoch
            model_path = os.path.join(args.output_dir, args.output_filename)
            logger.info("Saving model to %s", model_path)
            print_fn(f"Saving model to {model_path}")
            states = {
                "model": model.state_dict(),
                "optimizer": optim.state_dict(),
                "scheduler": scheduler.state_dict(),
            }
            torch.save(states, model_path)

    elif mode == "test":
        # For 'test' mode, together with the commands in README.md, BLEU
        # is evaluated based on text tokens, which is the standard metric.
        fname = os.path.join(args.output_dir, "test.output")
        hwords, rwords = [], []
        for hyp, ref in zip(hypotheses, references):
            hwords.append(vocab.map_ids_to_tokens_py(hyp))
            rwords.append(vocab.map_ids_to_tokens_py(ref))
        hwords = tx.utils.str_join(hwords)
        rwords = tx.utils.str_join(rwords)
        hyp_file, ref_file = tx.utils.write_paired_text(
            hwords, rwords, fname, mode="s",
            src_fname_suffix="hyp", tgt_fname_suffix="ref",
        )
        logger.info("Test output written to file: %s", hyp_file)
        print_fn(f"Test output written to file: {hyp_file}")
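# A hedged usage sketch for the function above: when `_eval_epoch` is called
# from inside a tqdm-wrapped training loop, passing `tqdm.tqdm.write` as
# `print_fn` keeps log messages from breaking the progress bar, while a
# standalone run falls back to the default `print`. The surrounding training
# loop here is assumed for illustration, not taken from this repository.
import tqdm

def _run_training_sketch(num_epochs):
    for epoch in range(num_epochs):
        # ... train for one epoch (with its own progress bar) ...
        _eval_epoch(epoch, mode="valid", print_fn=tqdm.tqdm.write)
    _eval_epoch(num_epochs - 1, mode="test")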
def _eval_epoch(epoch, mode):
    torch.cuda.empty_cache()
    if mode == 'eval':
        eval_data = dev_data
    elif mode == 'test':
        eval_data = test_data
    else:
        raise ValueError('`mode` should be either "eval" or "test".')

    references, hypotheses = [], []
    bsize = config_data.test_batch_size
    for i in tqdm(range(0, len(eval_data), bsize)):
        sources, targets = zip(*eval_data[i:i + bsize])
        with torch.no_grad():
            x_block = data_utils.source_pad_concat_convert(
                sources, device=device)
            predictions = model(
                encoder_input=x_block,
                is_train_mode=False,
                beam_width=beam_width)
        if beam_width == 1:
            decoded_ids = predictions[0].sample_id
        else:
            decoded_ids = predictions["sample_id"][:, :, 0]
        hypotheses.extend(h.tolist() for h in decoded_ids)
        references.extend(r.tolist() for r in targets)

    hypotheses = utils.list_strip_eos(hypotheses, eos_token_id)
    references = utils.list_strip_eos(references, eos_token_id)

    if mode == 'eval':
        # Writes results to files to evaluate BLEU
        # For 'eval' mode, the BLEU is based on token ids (rather than
        # text tokens) and serves only as a surrogate metric to monitor
        # the training process
        # TODO: Use texar.evals.bleu
        fname = os.path.join(args.model_dir, 'tmp.eval')
        hwords, rwords = [], []
        for hyp, ref in zip(hypotheses, references):
            hwords.append([str(y) for y in hyp])
            rwords.append([str(y) for y in ref])
        hwords = tx.utils.str_join(hwords)
        rwords = tx.utils.str_join(rwords)
        hyp_fn, ref_fn = tx.utils.write_paired_text(
            hwords, rwords, fname, mode='s',
            src_fname_suffix='hyp', tgt_fname_suffix='ref')
        eval_bleu = bleu_wrapper(ref_fn, hyp_fn, case_sensitive=True)
        eval_bleu = 100. * eval_bleu
        logger.info("epoch: %d, eval_bleu %.4f", epoch, eval_bleu)
        print(f"epoch: {epoch:d}, eval_bleu {eval_bleu:.4f}")

        if eval_bleu > best_results['score']:
            logger.info("epoch: %d, best bleu: %.4f", epoch, eval_bleu)
            best_results['score'] = eval_bleu
            best_results['epoch'] = epoch
            model_path = os.path.join(args.model_dir, args.model_fn)
            logger.info("Saving model to %s", model_path)
            print(f"Saving model to {model_path}")
            states = {
                'model': model.state_dict(),
                'optimizer': optim.state_dict(),
                'scheduler': scheduler.state_dict(),
            }
            torch.save(states, model_path)

    elif mode == 'test':
        # For 'test' mode, together with the commands in README.md, BLEU
        # is evaluated based on text tokens, which is the standard metric.
        fname = os.path.join(args.model_dir, 'test.output')
        hwords, rwords = [], []
        for hyp, ref in zip(hypotheses, references):
            hwords.append([id2w[y] for y in hyp])
            rwords.append([id2w[y] for y in ref])
        hwords = tx.utils.str_join(hwords)
        rwords = tx.utils.str_join(rwords)
        hyp_fn, ref_fn = tx.utils.write_paired_text(
            hwords, rwords, fname, mode='s',
            src_fname_suffix='hyp', tgt_fname_suffix='ref')
        logger.info("Test output written to file: %s", hyp_fn)
        print(f"Test output written to file: {hyp_fn}")
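# A hedged sketch of how the checkpoint saved above could be restored before
# evaluating on the test set. The names (`model`, `optim`, `scheduler`,
# `args.model_dir`, `args.model_fn`, `device`) follow the function above; the
# exact loading code in this repository may differ.
import os
import torch

def _load_checkpoint_sketch():
    ckpt_path = os.path.join(args.model_dir, args.model_fn)
    ckpt = torch.load(ckpt_path, map_location=device)
    model.load_state_dict(ckpt['model'])
    optim.load_state_dict(ckpt['optimizer'])
    scheduler.load_state_dict(ckpt['scheduler'])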
def _eval_epoch(sess, epoch, mode):
    print('Starting %s' % mode)
    if mode not in ('eval', 'test'):
        raise ValueError('Unknown mode: %s' % mode)

    dataset_name = 'eval' if mode == 'eval' else 'test'
    data_iterator.restart_dataset(sess, dataset_name)
    references, hypotheses, inputs = [], [], []

    while True:
        try:
            feed_dict = {
                data_iterator.handle: data_iterator.get_handle(
                    sess, dataset_name),
                tx.global_mode(): tf.estimator.ModeKeys.EVAL,
            }
            fetches = {
                'beam_search_ids': beam_search_ids,
                'tgt_labels': tgt_labels,
                # src_input_ids is not necessary for calculating the metric,
                # but allows us to write it to a file.
                'src_input_ids': src_input_ids
            }
            fetches_ = sess.run(fetches, feed_dict=feed_dict)
            hypotheses.extend(
                h.tolist() for h in fetches_['beam_search_ids'])
            references.extend(r.tolist() for r in fetches_['tgt_labels'])
            inputs.extend(h.tolist() for h in fetches_['src_input_ids'])
        except tf.errors.OutOfRangeError:
            break

    hypotheses = utils.list_strip_eos(hypotheses, eos_token_id)
    references = utils.list_strip_eos(references, eos_token_id)

    def calculate_scores():
        hyp_fn, ref_fn = 'tmp.%s.src' % mode, 'tmp.%s.tgt' % mode
        write_token_id_arrays_to_text_file(
            hypotheses, os.path.join(model_dir, hyp_fn), tokenizer)
        write_token_id_arrays_to_text_file(
            references, os.path.join(model_dir, ref_fn), tokenizer)

        hyp_fn, ref_fn = os.path.join(model_dir, hyp_fn), os.path.join(
            model_dir, ref_fn)
        files_rouge = FilesRouge(hyp_fn, ref_fn)
        rouge_scores = files_rouge.get_scores(avg=True)
        bleu_score = bleu_wrapper(ref_fn, hyp_fn, case_sensitive=True)
        return rouge_scores, bleu_score

    if mode == 'eval':
        try:
            rouge_scores, bleu_score = calculate_scores()
        except ValueError:
            print("Failed to calculate rouge scores!")
            return

        print_rouge_scores(rouge_scores)
        print('epoch: %d, bleu_score %.4f' % (epoch, bleu_score))

        if bleu_score > best_results['score']:
            best_results['score'] = bleu_score
            best_results['epoch'] = epoch
            model_path = os.path.join(model_dir, 'best-model.ckpt')
            print('saving model to %s' % model_path)

            # Also save the best results in text files for manual evaluation
            write_token_id_arrays_to_text_file(
                inputs, os.path.join(model_dir, 'eval-inputs.txt'), tokenizer)
            write_token_id_arrays_to_text_file(
                hypotheses, os.path.join(model_dir, 'eval-predictions.txt'),
                tokenizer)
            write_token_id_arrays_to_text_file(
                references, os.path.join(model_dir, 'eval-targets.txt'),
                tokenizer)

            saver.save(sess, model_path)

    elif mode == 'test':
        rouge_scores, bleu_score = calculate_scores()
        print_rouge_scores(rouge_scores)
        print('bleu_score %.4f' % bleu_score)

        # Also save the results in text files for manual evaluation
        write_token_id_arrays_to_text_file(
            inputs, os.path.join(model_dir, 'test-inputs.txt'), tokenizer)
        write_token_id_arrays_to_text_file(
            hypotheses, os.path.join(model_dir, 'test-predictions.txt'),
            tokenizer)
        write_token_id_arrays_to_text_file(
            references, os.path.join(model_dir, 'test-targets.txt'), tokenizer)
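# `print_rouge_scores` is not defined in this snippet. A minimal sketch is
# shown below, assuming the dict layout returned by the `rouge` package's
# `FilesRouge.get_scores(avg=True)` (e.g. {'rouge-1': {'f': ..., 'p': ...,
# 'r': ...}, ...}); the repository's actual helper may format the output
# differently.
def _print_rouge_scores_sketch(rouge_scores):
    for metric, values in rouge_scores.items():
        print('%s: f=%.4f, p=%.4f, r=%.4f'
              % (metric, values['f'], values['p'], values['r']))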