Example #1
import codecs

import tensorflow as tf

# `bleu` (providing compute_bleu), `process_dialogue_infer` and ROLE_TOKENS
# are assumed to be project-local helpers available at module level.
import bleu


def _bleu(ref_file, trans_file, mode="brief"):
    """Compute BLEU scores, handling BPE."""
    max_order = 4
    smooth = False

    ref_files = [ref_file]
    reference_text = []
    for reference_filename in ref_files:
        with codecs.getreader("utf-8")(tf.gfile.GFile(reference_filename,
                                                      "rb")) as fh:
            reference_text.append(fh.readlines())

    per_segment_references = []
    role_tokens = []
    for references in zip(*reference_text):
        reference_list = []
        for reference in references:
            reference, role = process_dialogue_infer(reference.rstrip(),
                                                     get_role_token=True)
            reference_list.append(reference.split(" "))
        per_segment_references.append(reference_list)
        # `role` is taken from the last reference in the set; with a single
        # reference file the inner loop runs exactly once per segment.
        role_tokens.append(role)

    translations = []
    with codecs.getreader("utf-8")(tf.gfile.GFile(trans_file, "rb")) as fh:
        for line in fh:
            translations.append(line.rstrip().split(" "))

    results = {}
    bleu_score, _, _, _, _, _ = bleu.compute_bleu(per_segment_references,
                                                  translations, max_order,
                                                  smooth)
    results["all"] = 100 * bleu_score
    if mode == "brief":
        return results["all"]

    for role in ROLE_TOKENS:
        _sub_ref_texts = []
        _sub_trans = []
        for _r, _t, _role in zip(per_segment_references, translations,
                                 role_tokens):
            if _role == role:
                _sub_ref_texts.append(_r)
                _sub_trans.append(_t)
        bleu_score, _, _, _, _, _ = bleu.compute_bleu(_sub_ref_texts,
                                                      _sub_trans, max_order,
                                                      smooth)
        results[role] = 100 * bleu_score

    return results
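A hypothetical invocation of the function above; the file paths are illustrative, and any mode other than "brief" yields the per-role dict:

# Hypothetical paths; mode="brief" returns a float, anything else the full dict.
overall = _bleu("data/dev.ref", "data/dev.trans", mode="brief")
print("overall BLEU: {:.2f}".format(overall))

per_role = _bleu("data/dev.ref", "data/dev.trans", mode="full")
for role, score in per_role.items():  # "all" plus one entry per role token
    print("{}: {:.2f}".format(role, score))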
Example #2
import codecs

import tensorflow as tf

# `bleu` (providing compute_bleu) and `process_dialogue_infer` are assumed
# project-local helpers.
import bleu


def _bleu(ref_file, trans_file):
    """Compute BLEU scores, handling BPE."""
    max_order = 4
    smooth = False

    ref_files = [ref_file]
    reference_text = []
    for reference_filename in ref_files:
        with codecs.getreader("utf-8")(tf.gfile.GFile(reference_filename,
                                                      "rb")) as fh:
            reference_text.append(fh.readlines())

    per_segment_references = []
    for references in zip(*reference_text):
        reference_list = []
        for reference in references:
            # Strip the trailing newline before processing, as in Example #1.
            reference = process_dialogue_infer(reference.rstrip())
            reference_list.append(reference.split(" "))
        per_segment_references.append(reference_list)

    translations = []
    with codecs.getreader("utf-8")(tf.gfile.GFile(trans_file, "rb")) as fh:
        for line in fh:
            # rstrip() drops the trailing newline so the final token is clean.
            translations.append(line.rstrip().split(" "))

    bleu_score, _, _, _, _, _ = bleu.compute_bleu(per_segment_references,
                                                  translations, max_order,
                                                  smooth)
    return 100 * bleu_score
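For reference, a minimal sketch of the nesting the loops above build and compute_bleu consumes; the token lists are illustrative:

# Each segment gets a list of reference token-lists; translations are flat.
# With sentences this short the 4-gram score is degenerate; this shows shapes.
per_segment_references = [
    [["the", "cat", "sat"]],   # segment 0: one reference
    [["hello", "world"]],      # segment 1
]
translations = [
    ["the", "cat", "sits"],
    ["hello", "world"],
]
score, _, _, _, _, _ = bleu.compute_bleu(per_segment_references, translations,
                                         4, False)
print(100 * score)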
Example #3
def blue(trans, refs, subword_option=None):
    '''
    `trans` and `refs` must be lists of the same length, aligned one-to-one.
    :param trans: list(str), shape (N,)
    :param refs: list(str), shape (N,), one reference per translation,
        or a path to a reference file to be loaded with load_file
    :return: BLEU score on a 0-100 scale
    '''
    # load_file, _clean and compute_bleu are assumed project-local helpers.
    if isinstance(refs, str):
        refs = load_file(refs, subword_option)

    transs = [_clean(line.strip(), subword_option).split(' ') for line in trans]

    # compute_bleu expects a list of reference *sets* per segment, so wrap
    # each single reference in its own list.
    refss = []
    for reference in refs:
        refss.append([reference.strip().split(' ')])
    blue_score, _, _, _, _, _ = compute_bleu(refss, transs, max_order=4,
                                             smooth=False)

    return 100 * blue_score
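A hypothetical call to the function above with in-memory lists; the sentences are illustrative:

# Hypothetical usage; `refs` may instead be a path to a reference file.
hyps = ["the cat sat on the mat", "hello world"]
refs = ["the cat sat on a mat", "hello world"]
print(blue(hyps, refs))  # BLEU on a 0-100 scale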
Example #4
def calculate_metrics_results(results: dict):
    # jiwer, nltk's single_meteor_score and the project-local rouge/bleu
    # modules are assumed to be imported at module level.
    for data in results.keys():
        references = []
        translations = []
        references_rouge = []
        translations_rouge = []
        wert = 0.0     # running WER total across videos
        meteort = 0.0  # running METEOR total across videos
        for video in results[data].keys():
            translation = results[data][video]["prediction_sentence"]
            # Drop the end-of-sequence token (only the first occurrence).
            if '</s>' in translation:
                translation.remove('</s>')
            translation = " ".join(translation)

            reference = results[data][video]["target_sentence"]
            if '</s>' in reference:
                reference.remove('</s>')
            reference = " ".join(reference)

            wert += jiwer.wer(truth=reference, hypothesis=translation)
            meteort += single_meteor_score(reference, translation)

            translations.append(translation.split(" "))
            translations_rouge.append(translation)

            references.append([reference.split(" ")])
            references_rouge.append(reference)

        print(len(references))  # number of evaluated videos in this split
        rouge_score_map = rouge.rouge(translations_rouge, references_rouge)
        print(data + ' rouge: ' +
              str(100 * rouge_score_map["rouge_l/f_score"]))
        print(data + ' WER: ' + str((wert / len(references)) * 100))
        print(data + ' Meteor: ' + str((meteort / len(references)) * 100))
        for max_ in range(1, 5):
            bleu_score, _, _, _, _, _ = bleu.compute_bleu(references,
                                                          translations,
                                                          max_order=max_)
            print(data + ' bleu: ' + str(max_) + " " + str(bleu_score * 100))
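A sketch of the nested results dict the function above consumes; the split name, video id and token lists are illustrative:

# Illustrative structure: split -> video id -> token lists.
results = {
    "test": {
        "video_001": {
            "prediction_sentence": ["the", "cat", "sat", "</s>"],
            "target_sentence": ["the", "cat", "sat", "</s>"],
        },
    },
}
calculate_metrics_results(results)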
Example #5
def train(data_train,
          data_val,
          data_test,
          model,
          loss_function,
          val_tgt_sentences,
          test_tgt_sentences,
          translator,
          start_epoch,
          ctx,
          tb_sw=None):
    """Training function.
    """

    trainer = gluon.Trainer(model.collect_params(), FLAGS.optimizer,
                            {'learning_rate': FLAGS.lr})

    train_data_loader, val_data_loader, test_data_loader = get_dataloaders(
        data_train, data_val, data_test)

    best_valid_bleu = 0.0
    for epoch_id in range(start_epoch, FLAGS.epochs):
        log_avg_loss = 0
        log_wc = 0
        log_start_time = time.time()
        for batch_id, (src_seq, tgt_seq, src_valid_length,
                       tgt_valid_length) in enumerate(train_data_loader):
            # if batch_id == len(train_data_loader)-1:
            #     break  # errors on last batch, jump out for now

            # put on the right ctx
            src_seq = src_seq.as_in_context(ctx)
            tgt_seq = tgt_seq.as_in_context(ctx)
            src_valid_length = src_valid_length.as_in_context(ctx)
            tgt_valid_length = tgt_valid_length.as_in_context(ctx)

            # calc the outs, the loss and back pass
            with mx.autograd.record():
                out, _ = model(src_seq, tgt_seq[:, :-1], src_valid_length,
                               tgt_valid_length - 1)
                loss = loss_function(out, tgt_seq[:, 1:],
                                     tgt_valid_length - 1).mean()
                loss = loss * (tgt_seq.shape[1] - 1) / (tgt_valid_length -
                                                        1).mean()
                loss.backward()

            # step the trainer and add up some losses
            trainer.step(1)
            src_wc = src_valid_length.sum().asscalar()
            tgt_wc = (tgt_valid_length - 1).sum().asscalar()
            step_loss = loss.asscalar()
            log_avg_loss += step_loss
            log_wc += src_wc + tgt_wc

            # log this batches statistics
            if tb_sw:
                tb_sw.add_scalar(tag='Training_loss',
                                 scalar_value=step_loss,
                                 global_step=(epoch_id * len(data_train) +
                                              batch_id * FLAGS.batch_size))
                tb_sw.add_scalar(tag='Training_ppl',
                                 scalar_value=np.exp(step_loss),
                                 global_step=(epoch_id * len(data_train) +
                                              batch_id * FLAGS.batch_size))

            if (batch_id + 1) % FLAGS.log_interval == 0:
                wps = log_wc / (time.time() - log_start_time)
                logging.info(
                    '[Epoch {} Batch {}/{}] loss={:.4f}, ppl={:.4f}  throughput={:.2f}K wps, wc={:.2f}K'
                    .format(epoch_id, batch_id + 1, len(train_data_loader),
                            log_avg_loss / FLAGS.log_interval,
                            np.exp(log_avg_loss / FLAGS.log_interval),
                            wps / 1000, log_wc / 1000))
                log_start_time = time.time()
                log_avg_loss = 0
                log_wc = 0

        # log embeddings
        if tb_sw:
            embs = mx.nd.array(list(range(len(
                data_train.vocab)))).as_in_context(ctx)
            embs = model.tgt_embed(embs)
            labs = data_train.vocab.idx_to_token
            tb_sw.add_embedding(mat=embs.asnumpy(),
                                metadata=labs,
                                global_step=(epoch_id * len(data_train) +
                                             batch_id * FLAGS.batch_size))

        # calculate validation and loss stats at end of epoch
        valid_loss, valid_translation_out = evaluate(val_data_loader, model,
                                                     loss_function, translator,
                                                     data_train, ctx)
        valid_bleu_score, _, _, _, _ = compute_bleu([val_tgt_sentences],
                                                    valid_translation_out)
        # valid_met_score = meteor_score([[' '.join(sent)] for sent in val_tgt_sentences], [' '.join(sent) for sent in valid_translation_out])
        str_ = '[Epoch {}] valid Loss={:.4f}, valid ppl={:.4f}, valid bleu={:.2f}'.format(
            epoch_id, valid_loss, np.exp(valid_loss), valid_bleu_score * 100)

        nlgeval = NLGEval()
        metrics_dict = nlgeval.compute_metrics(
            [[' '.join(sent) for sent in val_tgt_sentences]],
            [' '.join(sent) for sent in valid_translation_out])

        for k, v in metrics_dict.items():
            str_ += ', valid ' + k + '={:.4f}'.format(float(v))
        logging.info(str_)

        # log the validation and loss stats
        if tb_sw:
            tb_sw.add_scalar(tag='Validation_loss',
                             scalar_value=valid_loss,
                             global_step=(epoch_id * len(data_train) +
                                          batch_id * FLAGS.batch_size))
            tb_sw.add_scalar(tag='Validation_ppl',
                             scalar_value=np.exp(valid_loss),
                             global_step=(epoch_id * len(data_train) +
                                          batch_id * FLAGS.batch_size))
            tb_sw.add_scalar(tag='Validation_bleu',
                             scalar_value=valid_bleu_score * 100,
                             global_step=(epoch_id * len(data_train) +
                                          batch_id * FLAGS.batch_size))
            tb_sw.add_text(tag='Validation Caps',
                           text_string=get_comp_str(val_tgt_sentences,
                                                    valid_translation_out),
                           global_step=(epoch_id * len(data_train) +
                                        batch_id * FLAGS.batch_size))

            for k, v in metrics_dict.items():
                tb_sw.add_scalar(tag='Validation_' + k,
                                 scalar_value=float(v),
                                 global_step=(epoch_id * len(data_train) +
                                              batch_id * FLAGS.batch_size))

        # also calculate the test stats
        test_loss, test_translation_out = evaluate(test_data_loader, model,
                                                   loss_function, translator,
                                                   data_train, ctx)
        test_bleu_score, _, _, _, _ = compute_bleu([test_tgt_sentences],
                                                   test_translation_out)
        # test_met_score = meteor_score([test_tgt_sentences], test_translation_out)
        str_ = '[Epoch {}] test Loss={:.4f}, test ppl={:.4f}, test bleu={:.2f}'.format(
            epoch_id, test_loss, np.exp(test_loss), test_bleu_score * 100)

        nlgeval = NLGEval()
        metrics_dict = nlgeval.compute_metrics(
            [[' '.join(sent) for sent in test_tgt_sentences]],
            [' '.join(sent) for sent in test_translation_out])

        for k, v in metrics_dict.items():
            str_ += ', test ' + k + '={:.4f}'.format(float(v))
        logging.info(str_)

        # and log the test stats
        if tb_sw:
            tb_sw.add_scalar(tag='Test_loss',
                             scalar_value=test_loss,
                             global_step=(epoch_id * len(data_train) +
                                          batch_id * FLAGS.batch_size))
            tb_sw.add_scalar(tag='Test_ppl',
                             scalar_value=np.exp(test_loss),
                             global_step=(epoch_id * len(data_train) +
                                          batch_id * FLAGS.batch_size))
            tb_sw.add_scalar(tag='Test_bleu',
                             scalar_value=test_bleu_score * 100,
                             global_step=(epoch_id * len(data_train) +
                                          batch_id * FLAGS.batch_size))
            tb_sw.add_text(tag='Test Caps',
                           text_string=get_comp_str(test_tgt_sentences,
                                                    test_translation_out),
                           global_step=(epoch_id * len(data_train) +
                                        batch_id * FLAGS.batch_size))

            for k, v in metrics_dict.items():
                tb_sw.add_scalar(tag='Test_' + k,
                                 scalar_value=float(v),
                                 global_step=(epoch_id * len(data_train) +
                                              batch_id * FLAGS.batch_size))

        # write out the validation and test sentences to files
        write_sentences(
            valid_translation_out,
            os.path.join('models', 'captioning', FLAGS.model_id,
                         'epoch{:d}_valid_out.txt').format(epoch_id))
        write_sentences(
            test_translation_out,
            os.path.join('models', 'captioning', FLAGS.model_id,
                         'epoch{:d}_test_out.txt').format(epoch_id))

        # save the model params if best
        if valid_bleu_score > best_valid_bleu:
            best_valid_bleu = valid_bleu_score
            save_path = os.path.join('models', 'captioning', 'experiments',
                                     FLAGS.model_id, 'valid_best.params')
            logging.info('Save best parameters to {}'.format(save_path))
            model.save_parameters(save_path)

        # Decay the learning rate each epoch once two-thirds of training has
        # elapsed.
        if epoch_id + 1 >= (FLAGS.epochs * 2) // 3:
            new_lr = trainer.learning_rate * FLAGS.lr_update_factor
            logging.info('Learning rate change to {}'.format(new_lr))
            trainer.set_learning_rate(new_lr)

        model.save_parameters(
            os.path.join('models', 'captioning', 'experiments', FLAGS.model_id,
                         "{:04d}.params".format(epoch_id)))

    # load and evaluate the best model
    if os.path.exists(
            os.path.join('models', 'captioning', 'experiments', FLAGS.model_id,
                         'valid_best.params')):
        model.load_parameters(
            os.path.join('models', 'captioning', 'experiments', FLAGS.model_id,
                         'valid_best.params'))

    valid_loss, valid_translation_out = evaluate(val_data_loader, model,
                                                 loss_function, translator,
                                                 data_train, ctx)
    valid_bleu_score, _, _, _, _ = compute_bleu([val_tgt_sentences],
                                                valid_translation_out)

    str_ = 'Best model valid Loss={:.4f}, valid ppl={:.4f}, valid bleu={:.2f}'.format(
        valid_loss, np.exp(valid_loss), valid_bleu_score * 100)

    nlgeval = NLGEval()
    metrics_dict = nlgeval.compute_metrics(
        [[' '.join(sent) for sent in val_tgt_sentences]],
        [' '.join(sent) for sent in valid_translation_out])

    for k, v in metrics_dict.items():
        str_ += ', valid ' + k + '={:.4f}'.format(float(v))
    logging.info(str_)

    test_loss, test_translation_out = evaluate(test_data_loader, model,
                                               loss_function, translator,
                                               data_train, ctx)
    test_bleu_score, _, _, _, _ = compute_bleu([test_tgt_sentences],
                                               test_translation_out)

    str_ = 'Best model test Loss={:.4f}, test ppl={:.4f}, test bleu={:.2f}'.format(
        test_loss, np.exp(test_loss), test_bleu_score * 100)

    nlgeval = NLGEval()
    metrics_dict = nlgeval.compute_metrics(
        [[' '.join(sent) for sent in test_tgt_sentences]],
        [' '.join(sent) for sent in test_translation_out])

    for k, v in metrics_dict.items():
        str_ += ', test ' + k + '={:.4f}'.format(float(v))
    logging.info(str_)

    write_sentences(
        valid_translation_out,
        os.path.join('models', 'captioning', 'experiments', FLAGS.model_id,
                     'best_valid_out.txt'))
    write_sentences(
        test_translation_out,
        os.path.join('models', 'captioning', 'experiments', FLAGS.model_id,
                     'best_test_out.txt'))
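A hypothetical wiring of the training loop above; every object below is built elsewhere in the project, and the loss choice is an assumption:

# Hypothetical setup; names mirror the train() signature.
import mxnet as mx
import gluonnlp

ctx = mx.gpu(0) if mx.context.num_gpus() > 0 else mx.cpu()
loss_function = gluonnlp.loss.MaskedSoftmaxCELoss()  # assumed loss choice
train(data_train, data_val, data_test, model, loss_function,
      val_tgt_sentences, test_tgt_sentences, translator,
      start_epoch=0, ctx=ctx, tb_sw=None)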