def _bleu(ref_file, trans_file, mode="brief"):
    """Compute BLEU scores, handling BPE and per-role breakdowns."""
    max_order = 4
    smooth = False

    # read the reference file(s)
    ref_files = [ref_file]
    reference_text = []
    for reference_filename in ref_files:
        with codecs.getreader("utf-8")(
                tf.gfile.GFile(reference_filename, "rb")) as fh:
            reference_text.append(fh.readlines())

    # tokenize references and record the role token of each segment
    per_segment_references = []
    role_tokens = []
    for references in zip(*reference_text):
        reference_list = []
        for reference in references:
            reference, role = process_dialogue_infer(
                reference.rstrip(), get_role_token=True)
            reference_list.append(reference.split(" "))
        per_segment_references.append(reference_list)
        role_tokens.append(role)

    # tokenize translations
    translations = []
    with codecs.getreader("utf-8")(tf.gfile.GFile(trans_file, "rb")) as fh:
        for line in fh:
            translations.append(line.rstrip().split(" "))

    # corpus-level BLEU over all segments
    results = {}
    bleu_score, _, _, _, _, _ = bleu.compute_bleu(
        per_segment_references, translations, max_order, smooth)
    results["all"] = 100 * bleu_score
    if mode == "brief":
        return results["all"]

    # per-role BLEU: restrict references and translations to segments of each role
    for role in ROLE_TOKENS:
        _sub_ref_texts = []
        _sub_trans = []
        for _r, _t, _role in zip(per_segment_references, translations, role_tokens):
            if _role == role:
                _sub_ref_texts.append(_r)
                _sub_trans.append(_t)
        bleu_score, _, _, _, _, _ = bleu.compute_bleu(
            _sub_ref_texts, _sub_trans, max_order, smooth)
        results[role] = 100 * bleu_score

    return results
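# The wrappers above and below all delegate to a `compute_bleu` helper. As a
# hedged, self-contained sketch (not the project's actual implementation), the
# corpus-level BLEU they rely on, i.e. modified n-gram precision with a brevity
# penalty and no smoothing, can be written with only the standard library. The
# names `simple_corpus_bleu` and `_ngram_counts` are illustrative.
import collections
import math


def _ngram_counts(tokens, max_order):
    """Count all n-grams of order 1..max_order in a token list."""
    counts = collections.Counter()
    for order in range(1, max_order + 1):
        for i in range(len(tokens) - order + 1):
            counts[tuple(tokens[i:i + order])] += 1
    return counts


def simple_corpus_bleu(per_segment_references, translations, max_order=4):
    """per_segment_references: list of lists of reference token lists.
    translations: list of hypothesis token lists. Returns BLEU in [0, 1]."""
    matches = [0] * max_order
    possible = [0] * max_order
    ref_length = 0
    hyp_length = 0
    for references, hypothesis in zip(per_segment_references, translations):
        ref_length += min(len(r) for r in references)
        hyp_length += len(hypothesis)
        # clip each hypothesis n-gram count by its maximum count in any reference
        ref_counts = collections.Counter()
        for reference in references:
            ref_counts |= _ngram_counts(reference, max_order)
        for ngram, count in _ngram_counts(hypothesis, max_order).items():
            matches[len(ngram) - 1] += min(count, ref_counts[ngram])
        for order in range(1, max_order + 1):
            possible[order - 1] += max(len(hypothesis) - order + 1, 0)
    precisions = [m / p if p > 0 else 0.0 for m, p in zip(matches, possible)]
    if min(precisions) == 0.0:
        return 0.0  # un-smoothed BLEU is zero when any n-gram order has no match
    geo_mean = math.exp(sum(math.log(p) for p in precisions) / max_order)
    brevity_penalty = 1.0 if hyp_length > ref_length else math.exp(
        1.0 - ref_length / max(hyp_length, 1))
    return geo_mean * brevity_penalty


# e.g. simple_corpus_bleu([[["the", "cat", "sat", "on", "the", "mat"]]],
#                         [["the", "cat", "sat", "on", "the", "mat"]]) == 1.0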
def _bleu(ref_file, trans_file):
    """Compute BLEU scores, handling BPE."""
    max_order = 4
    smooth = False

    ref_files = [ref_file]
    reference_text = []
    for reference_filename in ref_files:
        with codecs.getreader("utf-8")(
                tf.gfile.GFile(reference_filename, "rb")) as fh:
            reference_text.append(fh.readlines())

    per_segment_references = []
    for references in zip(*reference_text):
        reference_list = []
        for reference in references:
            reference = process_dialogue_infer(reference)
            reference_list.append(reference.split(" "))
        per_segment_references.append(reference_list)

    translations = []
    with codecs.getreader("utf-8")(tf.gfile.GFile(trans_file, "rb")) as fh:
        for line in fh:
            # strip the trailing newline before tokenizing, so the final token
            # can still match the reference
            translations.append(line.rstrip().split(" "))

    bleu_score, _, _, _, _, _ = bleu.compute_bleu(
        per_segment_references, translations, max_order, smooth)
    return 100 * bleu_score
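# A hedged cross-check for the file-based wrapper above, assuming the sacrebleu
# package is available: it computes corpus BLEU directly from sentence strings,
# which helps catch tokenization or newline-stripping issues. The function name
# and file paths are placeholders, not part of the original project.
import io

import sacrebleu


def sacrebleu_from_files(ref_path, trans_path):
    with io.open(ref_path, encoding="utf-8") as fh:
        refs = [line.rstrip("\n") for line in fh]
    with io.open(trans_path, encoding="utf-8") as fh:
        hyps = [line.rstrip("\n") for line in fh]
    # corpus_bleu takes the hypotheses plus a list of reference streams;
    # the returned score is already scaled to 0-100
    return sacrebleu.corpus_bleu(hyps, [refs]).score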
def blue(trans, refs, subword_option=None):
    """Compute corpus BLEU.

    `trans` and `refs` must be lists of the same length, aligned one-to-one.

    :param trans: list(str), shape (N,)
    :param refs: list(list(str)), shape (R, N), where R is the number of references
    :return: BLEU score scaled to 0-100
    """
    if isinstance(refs, str):
        refs = load_file(refs, subword_option)
    transs = [_clean(line.strip(), subword_option).split(' ') for line in trans]
    refss = []
    for reference in refs:
        refss.append([reference.strip().split(' ')])
    blue_score, _, _, _, _, _ = compute_bleu(refss, transs, max_order=4, smooth=False)
    return 100 * blue_score
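# Illustrative call for `blue` above; the data is hypothetical. Despite the
# (R, N) shape mentioned in the docstring, the loop consumes one reference
# string per hypothesis, so both arguments are flat lists of N strings:
#
#   trans = ["the cat sat on the mat", "a dog barks"]
#   refs = ["the cat sat on a mat", "the dog barks"]
#   score = blue(trans, refs)  # corpus BLEU scaled to 0-100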
def calculate_metrics_results(results: dict):
    for data in results.keys():
        references = []
        translations = []
        references_rouge = []
        translations_rouge = []
        wert = 0.0
        meteort = 0.0
        for video in results[data].keys():
            translation = results[data][video]["prediction_sentence"]
            if '</s>' in translation:
                translation.remove('</s>')
            translation = " ".join(translation)

            reference = results[data][video]["target_sentence"]
            if '</s>' in reference:
                reference.remove('</s>')
            reference = " ".join(reference)

            wert += jiwer.wer(truth=reference, hypothesis=translation)
            meteort += single_meteor_score(reference, translation)

            translations.append(translation.split(" "))
            translations_rouge.append(translation)
            references.append([reference.split(" ")])
            references_rouge.append(reference)

        print(len(references))
        rouge_score_map = rouge.rouge(translations_rouge, references_rouge)
        print(data + ' rouge: ' + str(100 * rouge_score_map["rouge_l/f_score"]))
        print(data + ' WER: ' + str((wert / len(references)) * 100))
        print(data + ' Meteor: ' + str((meteort / len(references)) * 100))
        for max_ in range(1, 5):
            bleu_score, _, _, _, _, _ = bleu.compute_bleu(
                references, translations, max_order=max_)
            print(data + ' bleu: ' + str(max_) + " " + str(bleu_score * 100))
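# The loop above expects `results` to be nested as {split: {video_id: {...}}},
# with token lists that may end in '</s>'. A hypothetical minimal input:
#
#   results = {
#       "dev": {
#           "video_0001": {
#               "prediction_sentence": ["a", "man", "is", "signing", "</s>"],
#               "target_sentence": ["a", "man", "signs", "</s>"],
#           },
#       },
#   }
#   calculate_metrics_results(results)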
def train(data_train, data_val, data_test, model, loss_function, val_tgt_sentences,
          test_tgt_sentences, translator, start_epoch, ctx, tb_sw=None):
    """Training function."""
    trainer = gluon.Trainer(model.collect_params(), FLAGS.optimizer,
                            {'learning_rate': FLAGS.lr})
    train_data_loader, val_data_loader, test_data_loader = get_dataloaders(
        data_train, data_val, data_test)

    best_valid_bleu = 0.0
    for epoch_id in range(start_epoch, FLAGS.epochs):
        log_avg_loss = 0
        log_wc = 0
        log_start_time = time.time()
        for batch_id, (src_seq, tgt_seq, src_valid_length, tgt_valid_length) in enumerate(train_data_loader):
            # if batch_id == len(train_data_loader) - 1:
            #     break  # errors on last batch, jump out for now

            # put on the right ctx
            src_seq = src_seq.as_in_context(ctx)
            tgt_seq = tgt_seq.as_in_context(ctx)
            src_valid_length = src_valid_length.as_in_context(ctx)
            tgt_valid_length = tgt_valid_length.as_in_context(ctx)

            # calc the outs, the loss and back pass
            with mx.autograd.record():
                out, _ = model(src_seq, tgt_seq[:, :-1], src_valid_length,
                               tgt_valid_length - 1)
                loss = loss_function(out, tgt_seq[:, 1:], tgt_valid_length - 1).mean()
                loss = loss * (tgt_seq.shape[1] - 1) / (tgt_valid_length - 1).mean()
                loss.backward()

            # step the trainer and add up some losses
            trainer.step(1)
            src_wc = src_valid_length.sum().asscalar()
            tgt_wc = (tgt_valid_length - 1).sum().asscalar()
            step_loss = loss.asscalar()
            log_avg_loss += step_loss
            log_wc += src_wc + tgt_wc

            # log this batch's statistics
            if tb_sw:
                tb_sw.add_scalar(tag='Training_loss', scalar_value=step_loss,
                                 global_step=(epoch_id * len(data_train)
                                              + batch_id * FLAGS.batch_size))
                tb_sw.add_scalar(tag='Training_ppl', scalar_value=np.exp(step_loss),
                                 global_step=(epoch_id * len(data_train)
                                              + batch_id * FLAGS.batch_size))

            if (batch_id + 1) % FLAGS.log_interval == 0:
                wps = log_wc / (time.time() - log_start_time)
                logging.info(
                    '[Epoch {} Batch {}/{}] loss={:.4f}, ppl={:.4f} throughput={:.2f}K wps, wc={:.2f}K'
                    .format(epoch_id, batch_id + 1, len(train_data_loader),
                            log_avg_loss / FLAGS.log_interval,
                            np.exp(log_avg_loss / FLAGS.log_interval),
                            wps / 1000, log_wc / 1000))
                log_start_time = time.time()
                log_avg_loss = 0
                log_wc = 0

        # log embeddings
        if tb_sw:
            embs = mx.nd.array(list(range(len(data_train.vocab)))).as_in_context(ctx)
            embs = model.tgt_embed(embs)
            labs = data_train.vocab.idx_to_token
            tb_sw.add_embedding(mat=embs.asnumpy(), metadata=labs,
                                global_step=(epoch_id * len(data_train)
                                             + batch_id * FLAGS.batch_size))

        # calculate validation and loss stats at end of epoch
        valid_loss, valid_translation_out = evaluate(val_data_loader, model,
                                                     loss_function, translator,
                                                     data_train, ctx)
        valid_bleu_score, _, _, _, _ = compute_bleu([val_tgt_sentences],
                                                    valid_translation_out)
        # valid_met_score = meteor_score(
        #     [[' '.join(sent)] for sent in val_tgt_sentences],
        #     [' '.join(sent) for sent in valid_translation_out])
        str_ = '[Epoch {}] valid Loss={:.4f}, valid ppl={:.4f}, valid bleu={:.2f}'.format(
            epoch_id, valid_loss, np.exp(valid_loss), valid_bleu_score * 100)
        nlgeval = NLGEval()
        metrics_dict = nlgeval.compute_metrics(
            [[' '.join(sent) for sent in val_tgt_sentences]],
            [' '.join(sent) for sent in valid_translation_out])
        for k, v in metrics_dict.items():
            str_ += ', valid ' + k + '={:.4f}'.format(float(v))
        logging.info(str_)

        # log the validation and loss stats
        if tb_sw:
            tb_sw.add_scalar(tag='Validation_loss', scalar_value=valid_loss,
                             global_step=(epoch_id * len(data_train)
                                          + batch_id * FLAGS.batch_size))
            tb_sw.add_scalar(tag='Validation_ppl', scalar_value=np.exp(valid_loss),
                             global_step=(epoch_id * len(data_train)
                                          + batch_id * FLAGS.batch_size))
            tb_sw.add_scalar(tag='Validation_bleu',
                             scalar_value=valid_bleu_score * 100,
                             global_step=(epoch_id * len(data_train)
                                          + batch_id * FLAGS.batch_size))
            tb_sw.add_text(tag='Validation Caps',
                           text_string=get_comp_str(val_tgt_sentences,
                                                    valid_translation_out),
                           global_step=(epoch_id * len(data_train)
                                        + batch_id * FLAGS.batch_size))
            for k, v in metrics_dict.items():
                tb_sw.add_scalar(tag='Validation_' + k, scalar_value=float(v),
                                 global_step=(epoch_id * len(data_train)
                                              + batch_id * FLAGS.batch_size))

        # also calculate the test stats
        test_loss, test_translation_out = evaluate(test_data_loader, model,
                                                   loss_function, translator,
                                                   data_train, ctx)
        test_bleu_score, _, _, _, _ = compute_bleu([test_tgt_sentences],
                                                   test_translation_out)
        # test_met_score = meteor_score([test_tgt_sentences], test_translation_out)
        str_ = '[Epoch {}] test Loss={:.4f}, test ppl={:.4f}, test bleu={:.2f}'.format(
            epoch_id, test_loss, np.exp(test_loss), test_bleu_score * 100)
        nlgeval = NLGEval()
        metrics_dict = nlgeval.compute_metrics(
            [[' '.join(sent) for sent in test_tgt_sentences]],
            [' '.join(sent) for sent in test_translation_out])
        for k, v in metrics_dict.items():
            str_ += ', test ' + k + '={:.4f}'.format(float(v))
        logging.info(str_)

        # and log the test stats
        if tb_sw:
            tb_sw.add_scalar(tag='Test_loss', scalar_value=test_loss,
                             global_step=(epoch_id * len(data_train)
                                          + batch_id * FLAGS.batch_size))
            tb_sw.add_scalar(tag='Test_ppl', scalar_value=np.exp(test_loss),
                             global_step=(epoch_id * len(data_train)
                                          + batch_id * FLAGS.batch_size))
            tb_sw.add_scalar(tag='Test_bleu', scalar_value=test_bleu_score * 100,
                             global_step=(epoch_id * len(data_train)
                                          + batch_id * FLAGS.batch_size))
            tb_sw.add_text(tag='Test Caps',
                           text_string=get_comp_str(test_tgt_sentences,
                                                    test_translation_out),
                           global_step=(epoch_id * len(data_train)
                                        + batch_id * FLAGS.batch_size))
            for k, v in metrics_dict.items():
                tb_sw.add_scalar(tag='Test_' + k, scalar_value=float(v),
                                 global_step=(epoch_id * len(data_train)
                                              + batch_id * FLAGS.batch_size))

        # write out the validation and test sentences to files
        write_sentences(
            valid_translation_out,
            os.path.join('models', 'captioning', FLAGS.model_id,
                         'epoch{:d}_valid_out.txt').format(epoch_id))
        write_sentences(
            test_translation_out,
            os.path.join('models', 'captioning', FLAGS.model_id,
                         'epoch{:d}_test_out.txt').format(epoch_id))

        # save the model params if best
        if valid_bleu_score > best_valid_bleu:
            best_valid_bleu = valid_bleu_score
            save_path = os.path.join('models', 'captioning', 'experiments',
                                     FLAGS.model_id, 'valid_best.params')
            logging.info('Save best parameters to {}'.format(save_path))
            model.save_parameters(save_path)

        if epoch_id + 1 >= (FLAGS.epochs * 2) // 3:
            new_lr = trainer.learning_rate * FLAGS.lr_update_factor
            logging.info('Learning rate change to {}'.format(new_lr))
            trainer.set_learning_rate(new_lr)

        model.save_parameters(
            os.path.join('models', 'captioning', 'experiments', FLAGS.model_id,
                         "{:04d}.params".format(epoch_id)))

    # load and evaluate the best model
    if os.path.exists(
            os.path.join('models', 'captioning', 'experiments', FLAGS.model_id,
                         'valid_best.params')):
        model.load_parameters(
            os.path.join('models', 'captioning', 'experiments', FLAGS.model_id,
                         'valid_best.params'))

    valid_loss, valid_translation_out = evaluate(val_data_loader, model,
                                                 loss_function, translator,
                                                 data_train, ctx)
    valid_bleu_score, _, _, _, _ = compute_bleu([val_tgt_sentences],
                                                valid_translation_out)
    str_ = 'Best model valid Loss={:.4f}, valid ppl={:.4f}, valid bleu={:.2f}'.format(
        valid_loss, np.exp(valid_loss), valid_bleu_score * 100)
    nlgeval = NLGEval()
    metrics_dict = nlgeval.compute_metrics(
        [[' '.join(sent) for sent in val_tgt_sentences]],
        [' '.join(sent) for sent in valid_translation_out])
    for k, v in metrics_dict.items():
        str_ += ', valid ' + k + '={:.4f}'.format(float(v))
    logging.info(str_)

    test_loss, test_translation_out = evaluate(test_data_loader, model,
                                               loss_function, translator,
                                               data_train, ctx)
    test_bleu_score, _, _, _, _ = compute_bleu([test_tgt_sentences],
                                               test_translation_out)
    str_ = 'Best model test Loss={:.4f}, test ppl={:.4f}, test bleu={:.2f}'.format(
        test_loss, np.exp(test_loss), test_bleu_score * 100)
    nlgeval = NLGEval()
    metrics_dict = nlgeval.compute_metrics(
        [[' '.join(sent) for sent in test_tgt_sentences]],
        [' '.join(sent) for sent in test_translation_out])
    for k, v in metrics_dict.items():
        str_ += ', test ' + k + '={:.4f}'.format(float(v))
    logging.info(str_)

    write_sentences(
        valid_translation_out,
        os.path.join('models', 'captioning', 'experiments', FLAGS.model_id,
                     'best_valid_out.txt'))
    write_sentences(
        test_translation_out,
        os.path.join('models', 'captioning', 'experiments', FLAGS.model_id,
                     'best_test_out.txt'))
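# A hypothetical wiring sketch for `train` above. The FLAGS values, data
# pipelines, model, loss, translator, and SummaryWriter are all assumed from
# the surrounding project; this only makes the expected call shape concrete.
#
#   ctx = mx.gpu(0) if mx.context.num_gpus() > 0 else mx.cpu()
#   tb_sw = SummaryWriter(logdir=os.path.join('models', 'captioning', FLAGS.model_id))
#   train(data_train, data_val, data_test, model, loss_function,
#         val_tgt_sentences, test_tgt_sentences, translator,
#         start_epoch=0, ctx=ctx, tb_sw=tb_sw)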