Example #1
def calculate_bleu(data, src_field, trg_field, model, device, max_len=80):

    trgs = []
    pred_trgs = []
    trgs_filter = []
    pred_trgs_filter = []
    target_bleu_list = [
        'b@0', 'b@1', 'b@2', 'b@3', 'b@4', 'b@5', 'b@6', 'b@7', 'b@8', 'b@9'
    ]

    for datum in data:

        ques = vars(datum)['question']
        ans = vars(datum)['answer_text']
        doc = vars(datum)['article']
        trg = vars(datum)['distractor']
        bleu = vars(datum)['bleu1']

        print('ques = ' + ' '.join(ques))
        print('ans = ' + ' '.join(ans))

        trg = trg[1:]
        print('trg = ' + ' '.join(trg))
        max_bleu = 0
        max_pred_trg = ''
        for target_bleu in target_bleu_list:
            pred_trg, _ = translate_sentence(ans, ques, doc, bleu, src_field,
                                             trg_field, model, device,
                                             target_bleu, max_len)
            print(target_bleu + ' : ' + ' '.join(pred_trg))
            #cut off <eos> token, cut off special char "b@n"
            pred_trg = pred_trg[:-1]
            bleu_filter = bleu_score([pred_trg], [[ans]],
                                     max_n=1,
                                     weights=[1.0])
            print(bleu_filter)
            if bleu_filter > max_bleu:
                max_pred_trg = pred_trg
                max_bleu = bleu_filter
        pred_trgs.append(max_pred_trg)
        trgs.append([trg])
        print('predicted = ' + ' '.join(max_pred_trg))
        print()
        # filter on the best candidate found above, not the last one generated
        if 0.2 < max_bleu < 0.6:
            pred_trgs_filter.append(max_pred_trg)
            trgs_filter.append([trg])

    orinum = len(pred_trgs)
    newnum = len(pred_trgs_filter)
    print(f'original number = {orinum}')
    print(f'new number = {newnum}')
    return bleu_score(pred_trgs, trgs, max_n=1, weights=[1.0]), \
        bleu_score(pred_trgs, trgs, max_n=2, weights=[1.0/2]*2), \
        bleu_score(pred_trgs, trgs, max_n=3, weights=[1.0/3]*3), \
        bleu_score(pred_trgs, trgs, max_n=4, weights=[1.0/4]*4), \
        bleu_score(pred_trgs_filter, trgs_filter, max_n=1, weights=[1.0]), \
        bleu_score(pred_trgs_filter, trgs_filter, max_n=2, weights=[1.0/2]*2), \
        bleu_score(pred_trgs_filter, trgs_filter, max_n=3, weights=[1.0/3]*3), \
        bleu_score(pred_trgs_filter, trgs_filter, max_n=4, weights=[1.0/4]*4)
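Most snippets on this page follow the same recipe, so here is a minimal, self-contained sketch of the nesting convention torchtext's bleu_score expects (candidates are token lists, references are lists of alternative token lists; the sentences are made up for illustration):

from torchtext.data.metrics import bleu_score

candidate = [['a', 'cat', 'is', 'on', 'the', 'mat']]
references = [[['the', 'cat', 'is', 'on', 'the', 'mat']]]

# BLEU-1 .. BLEU-4 with uniform weights, mirroring the calls above
for n in range(1, 5):
    score = bleu_score(candidate, references, max_n=n, weights=[1.0 / n] * n)
    print(f'BLEU-{n}: {score:.4f}')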
Example #2
    def test_bleu_score(self):
        # Full match
        candidate = [['My', 'full', 'pytorch', 'test']]
        refs = [[['My', 'full', 'pytorch', 'test'], ['Completely', 'Different']]]
        assert bleu_score(candidate, refs) == 1

        # No 4-gram
        candidate = [['My', 'full', 'pytorch']]
        refs = [[['My', 'full', 'pytorch', 'test'], ['Completely', 'Different']]]
        assert bleu_score(candidate, refs) == 0

        # Partial match
        candidate = [['My', 'full', 'pytorch', 'test']]
        refs = [[['My', 'full', 'pytorch', 'test', '!'], ['Different']]]
        self.assertEqual(bleu_score(candidate, refs), 0.7788007)

        # Bigrams and unigrams only
        candidate = [['My', 'pytorch', 'test']]
        refs = [[['My', 'full', 'pytorch', 'test'], ['Different']]]
        self.assertEqual(bleu_score(candidate, refs, max_n=2,
                                    weights=[0.5, 0.5]), 0.5066641)

        # Multi-sentence corpus
        candidate = [['My', 'full', 'pytorch', 'test'], ['Another', 'Sentence']]
        refs = [[['My', 'full', 'pytorch', 'test'], ['Completely', 'Different']],
                [['No', 'Match']]]
        self.assertEqual(bleu_score(candidate, refs), 0.8408964)

        # Empty input
        candidate = [[]]
        refs = [[[]]]
        assert bleu_score(candidate, refs) == 0

        # Long input, compared to NLTK implementation score
        # nltk version used: 3.4.5
        candidate = [['Lucille', 'B', 'has', '3', 'sons'],
                     ['She', 'loves', 'all', 'her', 'children', 'equally'],
                     ['No', 'match', 'here', 'at', 'all']]

        refs = [[['I', 'heard', 'Lucille', 'has', 'three', 'sons'],
                ['Rumor', 'has', 'it', 'Lucille', 'has', '3', 'sons', '!']],
                [['I', 'love', 'all', 'my', 'children', 'equally'],
                ['She', 'loves', 'all', 'her', 'children', 'equally']],
                [['I', 'have', 'made', 'a', 'terrible', 'mistake'], ['Big', 'mistake']]]

        # The comments below give the code used to get each hardcoded bleu score
        # nltk.translate.bleu_score.corpus_bleu(refs, candidate)
        self.assertEqual(bleu_score(candidate, refs), 0.4573199)
        # nltk.translate.bleu_score.corpus_bleu(refs, candidate, weights=[0.33]*3)
        self.assertEqual(bleu_score(candidate, refs, 3,
                         weights=[0.33, 0.33, 0.33]), 0.4901113)
        # nltk.translate.bleu_score.corpus_bleu(refs, candidate, weights=[0.5]*2)
        self.assertEqual(bleu_score(candidate, refs, 2,
                         weights=[0.5, 0.5]), 0.5119535)
        # nltk.translate.bleu_score.corpus_bleu(refs, candidate, weights=[1])
        self.assertEqual(bleu_score(candidate, refs, 1,
                         weights=[1]), 0.5515605)
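The hardcoded scores above were cross-checked against NLTK 3.4.5; a quick sketch of regenerating them (note that NLTK takes the references first, the reverse of torchtext's argument order; `candidate` and `refs` are the long-input variables from the test above):

from nltk.translate.bleu_score import corpus_bleu

print(corpus_bleu(refs, candidate))                     # ~0.4573199
print(corpus_bleu(refs, candidate, weights=[0.5] * 2))  # ~0.5119535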
Example #3
def test(data_loader, model, device, tokenizer, logger):

    model.eval()
    scores = []
    n_samples = len(data_loader)

    with tqdm(total=n_samples) as progress:
        sample = 0
        for videos, sentences in data_loader:
            if videos.shape[0] > MAX_SEQUENCE_LENGTH:
                continue
            decoder_input_ids = tokenizer.encode(sentences)
            decoder_input_ids = torch.tensor(decoder_input_ids)

            if data_loader.batch_size == 1:
                decoder_input_ids.unsqueeze_(0)
                videos.unsqueeze_(0)
            videos = videos.to(device)
            decoder_input_ids = decoder_input_ids.to(device)

            outputs = model(videos, decoder_input_ids)

            # Calculate BLEU score
            output_ids = torch.argmax(outputs[0], -1)
            output_sentences = tokenizer.decode(output_ids.view(-1))
            # Using PyTorch function (for more info: https://pytorch.org/text/data_metrics.html)
            # bleu_score expects token lists; assuming decode() returns a
            # plain string here, both sides are split into tokens
            scores.append(
                bleu_score([output_sentences.split()], [[sentences.split()]]))
            progress.update()
            sample += 1

        logger.info('Average BLEU score: {}'.format(np.array(scores).mean()))

    return scores
Example #4
def calculate_bleu_alt(iterator,
                       src_field,
                       trg_field,
                       model,
                       device,
                       max_len=50):
    trgs = []
    pred_trgs = []
    with torch.no_grad():
        for batch in iterator:
            src = batch.src
            trg = batch.trg
            _trgs = []
            for sentence in trg:
                tmp = []
                # Start from the first token which skips the <start> token
                for i in sentence[1:]:
                    # Targets are padded. So stop appending as soon as a padding or eos token is encountered
                    if i == trg_field.vocab.stoi[
                            trg_field.eos_token] or i == trg_field.vocab.stoi[
                                trg_field.pad_token]:
                        break
                    tmp.append(trg_field.vocab.itos[i])
                _trgs.append([tmp])
            trgs += _trgs
            pred_trg, _ = translate_sentence_vectorized(
                src, src_field, trg_field, model, device)
            pred_trgs += pred_trg
    return pred_trgs, trgs, bleu_score(pred_trgs, trgs)
Example #5
    def calculate_bleu_score(self,
                             test_dataset: Dataset,
                             max_len=128):
        trgs = []
        pred_trgs = []

        with torch.no_grad():
            for batch in tqdm(test_dataset.generate(self.config.batch_sz)):
                src = batch.src
                trg = batch.trg
                _trgs = []
                for sentence in trg:
                    tmp = []
                    for i in sentence[1:]:
                        if i == self.config.trg_vocab.eos_idx or\
                          i == self.config.trg_vocab.pad_idx:
                            break
                        tmp.append(self.config.trg_vocab.id_to_piece(i.item()))
                    _trgs.append([tmp])
                trgs += _trgs
                pred_trg, _ = self.translate_sentence_vectorized(
                    src, max_len=max_len)
                pred_trgs += pred_trg

        final_bleu_score = bleu_score(pred_trgs, trgs)
        logger.info(f'BLEU score = {final_bleu_score*100:.2f}')
        return pred_trgs, trgs, final_bleu_score
Example #6
def calculate_bleu(data,
                   source_field: Field,
                   target_field: Field,
                   model: nn.Module,
                   device: str,
                   max_len=50) -> float:

    targets = []
    predicted_targets = []

    for datum in data:
        src = vars(datum)['src']
        trg = vars(datum)['trg']

        predicted_target = translate_sentence(
            sentence=src,
            source_field=source_field,
            target_field=target_field,
            model=model,
            device=device,
            max_len=max_len,
        )

        #cut off <eos> token
        predicted_target = predicted_target[:-1]

        predicted_targets.append(predicted_target)

        targets.append([trg])

    return bleu_score(predicted_targets, targets)
Example #7
def show_bleu(data, SRC, TRG, model, device, logging = False, max_len=50):
    trgs = []
    pred_trgs = []
    index = 0

    for datum in data:
        src = vars(datum)['src']
        trg = vars(datum)['trg']

        pred_trg, _ = translate_sentence(src, SRC, TRG, model, device, max_len, logging=False)

        # remove <eos>
        pred_trg = pred_trg[:-1]

        pred_trgs.append(pred_trg)
        trgs.append([trg])

        index += 1
        if index % 100 == 0 and logging:
            print(f'[{index}/{len(data)}]')
            print(f'pred: {pred_trg}')
            print(f'answer: {trg}')
    bleu = bleu_score(pred_trgs, trgs, max_n=4, weights=[0.25, 0.25, 0.25, 0.25])
    print(f'Total BLEU Score = {bleu*100:.2f}')
    sys.stdout.flush()
Example #8
def calculate_bleu(dataset, src_field, trg_field, model, device, max_len=50):

    trgs = []
    pred_trgs = []

    for data in dataset:

        src = vars(data)['QnA']
        trg = vars(data)['Ans_Sen']

        pred_trg = rnn_predict(src, src_field, trg_field, model, device,
                               max_len)

        # cut off the <eos> token
        pred_trg = pred_trg[:-1]

        pred_trgs.append(pred_trg)
        trgs.append([trg])

    return bleu_score(pred_trgs, trgs)
Example #9
def calculate_bleu_score(model, dataloader, german_word_to_idx,
                         english_idx_to_word, device):
    model.eval()
    predicted_sentences = []
    target_sentences = []
    with torch.no_grad():
        for num, d in tqdm(enumerate(dataloader), total=len(dataloader)):
            german_idx = d['german_idx'].to(device)
            english_idx = d['english_idx'].to(device)

            predicted_english_idx = model(german_idx,
                                          english_idx,
                                          teacher_force_ratio=10)
            english_idx = english_idx.detach().cpu().numpy()
            predicted_english_idx = torch.softmax(predicted_english_idx,
                                                  dim=-1)
            predicted_english_idx = predicted_english_idx.argmax(-1)
            predicted_english_idx = predicted_english_idx.detach().cpu().numpy()

            for i in range(len(predicted_english_idx)):
                target_idx = english_idx[i]
                output = predicted_english_idx[i]

                predicted_sentence = decode(output, english_idx_to_word)
                predicted_sentences.append(predicted_sentence)

                target_sentence = decode(target_idx, english_idx_to_word)
                target_sentences.append([target_sentence])

    return bleu_score(predicted_sentences, target_sentences)
Example #10
def main():
    parser = argparse.ArgumentParser(description='translate.py')

    parser.add_argument('-data_pkl',
                        required=True,
                        help='Pickle file with vocabulary.')
    parser.add_argument('-trg_data', default='PSLG-PC12/ENG-ASL_Test.en')
    parser.add_argument('-pred_data',
                        default='predictions.txt',
                        help="""Path to output the predictions (each line will
                        be the decoded sequence)""")
    opt = parser.parse_args()

    data = pickle.load(open(opt.data_pkl, 'rb'))
    SRC, TRG = data['vocab']['src'], data['vocab']['trg']

    fields = [('src', SRC)]

    with open(opt.trg_data, 'r') as f:
        trg_loader = Dataset(
            examples=[Example.fromlist([x], fields) for x in f],
            fields={'src': SRC})
    trg_txt = [[x.src] for x in trg_loader]

    with open(opt.pred_data, 'r') as f:
        pred_loader = Dataset(
            examples=[Example.fromlist([x], fields) for x in f],
            fields={'src': SRC})
    pred_txt = [x.src for x in pred_loader]

    # candidates (predictions) first, wrapped references second
    score = bleu_score(pred_txt, trg_txt)
    print('Bleu 4 score is {}'.format(str(score)))

    with open('bleu_score.txt', 'w') as f:
        f.write('Bleu 4 score is {}'.format(str(score)))
Example #11
def calculate_bleu_score(candidate, reference, lang, max_n=2):
    """
    Input:
    - candidate: numpy array or pytorch tensor of shape (batch, max_seq_len1)
    - reference : numpy array or pytorch tensor of shape (batch, 5, max_seq_len2)
    - lang: Lang class instance that can be used to decode numerical captions
    
    Output:
    - scores: list containing BLEU scores for each sample, where len(scores) == batch
    """
    if isinstance(candidate, torch.Tensor):
        candidate = candidate.cpu().numpy()
    if isinstance(reference, torch.Tensor):
        reference = reference.cpu().numpy()

    scores = []
    for cand, ref_list in zip(
            candidate,
            reference):  # calculate the BLEU score for all items in the batch
        cand = [lang.decode_caption(cand).split()]
        ref_list = [[lang.decode_caption(ann).split() for ann in ref_list]]
        score = bleu_score(cand,
                           ref_list,
                           max_n=max_n,
                           weights=[1 / max_n for _ in range(max_n)])
        scores.append(score * 100)

    return scores
Example #12
def compute_metrics(hyp_dec_all,
                    ref_dec_all,
                    use_sacrebleu=True,
                    use_torchtext=True,
                    use_ter=False):
    metrics = {}

    # Sacrebleu
    if use_sacrebleu:
        metrics["sacrebleu_rawcorpusbleu"] = sacrebleu.raw_corpus_bleu(
            hyp_dec_all, [ref_dec_all]).score
        metrics["sacrebleu_bleu"] = sacrebleu.corpus_bleu(
            hyp_dec_all, [ref_dec_all]).score
        metrics["sacrebleu_chrf"] = sacrebleu.corpus_chrf(
            hyp_dec_all, [ref_dec_all]).score
        if use_ter:  # Quite slow
            metrics["sacrebleu_ter"] = sacrebleu.corpus_ter(
                hyp_dec_all, [ref_dec_all]).score

    # Torchtext
    if use_torchtext:
        m_bleu_score = bleu_score([x.split(" ") for x in hyp_dec_all],
                                  [[x.split(" ")] for x in ref_dec_all])
        metrics["torchtext_bleu"] = m_bleu_score * 100
    return metrics
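A short sketch of why the torchtext value is scaled by 100 above: sacrebleu reports BLEU on a 0-100 scale and applies its own tokenization, while torchtext returns 0-1 on pre-tokenized input. On plain, punctuation-free text with no zero n-gram precisions the two land close together (sentences are illustrative):

import sacrebleu
from torchtext.data.metrics import bleu_score

hyp = ['the quick brown fox jumps over the lazy dog']
ref = ['the quick brown fox jumped over the lazy dog']

print(sacrebleu.corpus_bleu(hyp, [ref]).score)          # 0-100 scale
print(bleu_score([h.split(' ') for h in hyp],
                 [[r.split(' ')] for r in ref]) * 100)  # rescaled to match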
Example #13
def bleu(data, model, german, english, device, syntax_embedding_size):
    targets = []
    outputs = []

    count = 0
    for example in data:
        count += 1
        src = vars(example)["src"]
        trg = vars(example)["trg"]

        prediction = translate_sentence(model, src, german, english, device,
                                        syntax_embedding_size)
        # prediction = prediction[:-1]  # optionally strip the <eos> token
        src = german.decode(src)
        trg = english.decode(trg)
        print(count, "src   :>>>", src)
        print("target:   ", trg)
        print("pred  :   ", prediction)
        targets.append([trg.split()])
        outputs.append(prediction.split())

    print("calc blue 1")
    #print(outputs)
    print("====================")
    #print(targets)
    #exit()
    blue = bleu_score(outputs, targets)
    #blue = bleu_score(targets, targets)

    print("calc blue 2")
    return blue
Example #14
def calculate_bleu(data, src_field, trg_field, model, device, max_len=50):
    print('*' * 40, ' Calculating BLEU ', '*' * 40)
    trgs = []
    pred_trgs = []
    pred_trgs_out = []
    for datum in data:

        src = vars(datum)['src']
        trg = vars(datum)['trg']

        pred_trg, _ = translate_sentence(src, src_field, trg_field, model,
                                         device, max_len)

        #cut off <eos> token
        pred_trg = pred_trg[:-1]
        pred_trg_out = ' '.join(pred_trg)
        pred_trgs.append(pred_trg)
        trgs.append([trg])
        pred_trgs_out.append(pred_trg_out)

    with open(file='./output/test_output.csv', mode='w') as f:
        writer = csv.writer(f,
                            delimiter='\n',
                            quotechar='\"',
                            quoting=csv.QUOTE_MINIMAL)
        writer.writerows([pred_trgs_out])

    return bleu_score(pred_trgs, trgs)
Example #15
def bleu(data, model, source_lang, target_lang, device, max_length,
         generate_outputs):
    targets = []
    outputs = []
    for example in data:
        src = vars(example)["src"]
        trg = vars(example)["trg"]
        #print("trg=========>",trg)
        #print("src==",src)
        prediction = Translate(model, src, source_lang, target_lang, device,
                               max_length)
        #print("predictions===",prediction)
        # remove <sos> <eos> token
        prediction = prediction[1:-1]

        targets.append([trg])
        outputs.append(prediction)

    if generate_outputs:
        writer = [' '.join(s) for s in outputs]
        with open("Outputs/testset_translated.txt", 'w',
                  encoding='utf-8') as op:
            for sent in writer[:-1]:
                op.write(sent + '\n')
            op.write(writer[-1])
    #print("org outputs-->",outputs)
    #print("org tagets-->",targets)
    return bleu_score(outputs, targets)
Example #16
def calculate_bleu_score(ground_truth, predictions):
    for i in range(len(ground_truth)):
        ground_truth[i] = [ground_truth[i].split(" ")]

    for i in range(len(predictions)):
        predictions[i] = predictions[i].split(' ')

    return bleu_score(predictions, ground_truth)
Example #17
def show_train_info(epoch,
                    start_time,
                    end_time,
                    train_loss,
                    valid_loss,
                    metric='perplexity',
                    **kwargs):
    # kwargs for bleu_score:
    # https://pytorch.org/text/data_metrics.html?highlight=bleu#torchtext.data.metrics.bleu_score
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    train_score = math.exp(
        train_loss) if metric == 'perplexity' else bleu_score(**kwargs)
    valid_score = math.exp(
        valid_loss) if metric == 'perplexity' else bleu_score(**kwargs)

    print(f'Epoch: {epoch:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train BLEU: {train_score:7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. BLEU: {valid_score:7.3f}')
Example #18
def count_bleu(output, trg, TRG):
    # output shape: seq_len * batch_size * feature 
    # trg shape: seq_len * batch_size
    # corpus level or sentence level bleu ?
    output = output.permute(1,0,2).max(2)[1]
    trg = trg.permute(1,0)
    candidate_corpus = [itos(idx_list, TRG) for idx_list in output]
    references_corpus = [[itos(idx_list, TRG)] for idx_list in trg]
    return bleu_score(candidate_corpus, references_corpus)
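On the question in the comment above: torchtext's bleu_score is corpus-level, pooling n-gram counts over all sentence pairs before taking the geometric mean, which is not the same as averaging per-sentence scores. A small sketch of the difference (sentences are made up):

from torchtext.data.metrics import bleu_score

candidates = [['my', 'full', 'pytorch', 'test'], ['another', 'sentence']]
references = [[['my', 'full', 'pytorch', 'test']], [['another', 'one']]]

# corpus-level: n-gram counts are pooled across all pairs
corpus = bleu_score(candidates, references)

# sentence-level: score each pair alone, then average
per_sent = [bleu_score([c], [r]) for c, r in zip(candidates, references)]
print(corpus, sum(per_sent) / len(per_sent))  # generally not equal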
Example #19
def google_bleu_score(gtrans, ref):
    m1 = []
    m2 = []
    for sent in gtrans:
        m1.append(sent.split())
    for sent in ref:
        m2.append([sent.split()])

    r = bleu_score(m1, m2)
    return r
Example #20
def evaluate(model: Model, dataset: Im2LatexDataset, args: Munch, num_batches: int = None, name: str = 'test'):
    """evaluates the model. Returns bleu score on the dataset

    Args:
        model (torch.nn.Module): the model
        dataset (Im2LatexDataset): test dataset
        args (Munch): arguments
        num_batches (int): How many batches to evaluate on. Defaults to None (all batches).
        name (str, optional): name of the test e.g. val or test for wandb. Defaults to 'test'.

    Returns:
        bleu_score: BLEU score of validation set.
    """
    assert len(dataset) > 0
    device = args.device
    log = {}
    bleus, edit_dists = [], []
    bleu_score, edit_distance = 0, 1
    pbar = tqdm(enumerate(iter(dataset)), total=len(dataset))
    for i, (seq, im) in pbar:
        if seq is None or im is None:
            continue
        tgt_seq, tgt_mask = seq['input_ids'].to(device), seq['attention_mask'].bool().to(device)
        encoded = model.encoder(im.to(device))
        #loss = decoder(tgt_seq, mask=tgt_mask, context=encoded)
        dec = model.decoder.generate(torch.LongTensor([args.bos_token]*len(encoded))[:, None].to(device), args.max_seq_len,
                                     eos_token=args.pad_token, context=encoded, temperature=args.get('temperature', .2))
        pred = detokenize(dec, dataset.tokenizer)
        truth = detokenize(seq['input_ids'], dataset.tokenizer)
        bleus.append(metrics.bleu_score(pred, [alternatives(x) for x in truth]))
        for predi, truthi in zip(token2str(dec, dataset.tokenizer), token2str(seq['input_ids'], dataset.tokenizer)):
            ts = post_process(truthi)
            edit_dists.append(distance(post_process(predi), ts)/len(ts))
        pbar.set_description('BLEU: %.3f, ED: %.2e' % (np.mean(bleus), np.mean(edit_dists)))
        if num_batches is not None and i >= num_batches:
            break
    if len(bleus) > 0:
        bleu_score = np.mean(bleus)
        log[name+'/bleu'] = bleu_score
    if len(edit_dists) > 0:
        edit_distance = np.mean(edit_dists)
        log[name+'/edit_distance'] = edit_distance
    if args.wandb:
        # samples
        pred = token2str(dec, dataset.tokenizer)
        truth = token2str(seq['input_ids'], dataset.tokenizer)
        table = wandb.Table(columns=["Truth", "Prediction"])
        for k in range(min([len(pred), args.test_samples])):
            table.add_data(post_process(truth[k]), post_process(pred[k]))
        log[name+'/examples'] = table
        wandb.log(log)
    else:
        print('\n%s\n%s' % (truth, pred))
        print('BLEU: %.2f' % bleu_score)
    return bleu_score, edit_distance
Example #21
    def eval_bleu(self, test_cases: Dict = None, silent: bool = False):
        if test_cases is None:
            test_cases = self.dataset.validation_stuff()
        src_sentences: List[List[int]] = [
            pair["source_val"] for pair in test_cases
        ]
        candidates = self.__eval_multiple(src_sentences, 300, silent)
        candidates = [[self.dataset.target_value2token(v) for v in s]
                      for s in candidates]
        sentences_refs = [pair["targets_tokens"] for pair in test_cases]
        return bleu_score(candidates, sentences_refs)
Example #22
def bleu(valid_src_data, valid_trg_data, model, SRC, TRG, device, k,
         max_strlen):
    pred_sents = []
    for sentence in valid_src_data:
        pred_trg = translate_sentence(sentence, model, SRC, TRG, device, k,
                                      max_strlen)
        pred_sents.append(pred_trg)

    pred_sents = [TRG.preprocess(sent) for sent in pred_sents]
    trg_sents = [[sent.split()] for sent in valid_trg_data]

    return bleu_score(pred_sents, trg_sents)
Example #23
def estimateBLEU(model, set_name, word2idx_dict, idx2word_dict, cw_idx_list,
                 qw_idx_list, device):
    # the candidate/reference corpora do not depend on n, so build them once
    candidate_corpus, references_corpus = FindCandidatesAndReferencesForBLEU(
        model, word2idx_dict, idx2word_dict, cw_idx_list, qw_idx_list, device)
    for i in range(1, 5):
        bleu_test = bleu_score(candidate_corpus,
                               references_corpus,
                               max_n=i,
                               weights=[1. / i] * i)
        print("BLEU-" + str(i) + " on " + set_name + " :" + str(bleu_test))
Example #24
def bleu(data, model, receptors, ligands, device):
    print("=> Calculating Bleu")
    targets = []
    outputs = []
    for example in data:
        src = vars(example)["src"]
        trg = vars(example)["trg"]
        prediction = translate_sentence(model, src, receptors, ligands, device)
        prediction = list(map(str, prediction[:-1]))  # remove <eos> token
        outputs.append(prediction)
        targets.append([list(map(str, trg))])  # wrap as a list of references
    return bleu_score(outputs, targets)
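Note the wrapping of each target above: every entry of the references corpus must be a list of alternative reference sentences, not a bare token list. A minimal sketch of what goes wrong otherwise:

from torchtext.data.metrics import bleu_score

pred = ['My', 'full', 'pytorch', 'test']
trg = ['My', 'full', 'pytorch', 'test']

print(bleu_score([pred], [[trg]]))  # 1.0: trg wrapped as one reference
print(bleu_score([pred], [trg]))    # 0.0: each token is treated as a reference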
Example #25
def show_bleu(data, src_field, trg_field, model, device, max_len=50):
    trgs = []
    pred_trgs = []
    index = 0

    for datum in data:
        src = vars(datum)['src']
        trg = vars(datum)['trg']

        pred_trg, _ = translate_sentence(src, src_field, trg_field, model, device, max_len, logging=False)

        # remove the trailing <eos> token
        pred_trg = pred_trg[:-1]

        pred_trgs.append(pred_trg)
        trgs.append([trg])

        index += 1
        if index % 100 == 0:
            print(f"[{index}/{len(data)}]")
            print(f"prediction: {pred_trg}")
            print(f"answer: {trg}")

    bleu = bleu_score(pred_trgs, trgs, max_n=4, weights=[0.25, 0.25, 0.25, 0.25])
    print(f'Total BLEU Score = {bleu * 100:.2f}')

    individual_bleu1_score = bleu_score(pred_trgs, trgs, max_n=4, weights=[1, 0, 0, 0])
    individual_bleu2_score = bleu_score(pred_trgs, trgs, max_n=4, weights=[0, 1, 0, 0])
    individual_bleu3_score = bleu_score(pred_trgs, trgs, max_n=4, weights=[0, 0, 1, 0])
    individual_bleu4_score = bleu_score(pred_trgs, trgs, max_n=4, weights=[0, 0, 0, 1])

    print(f'Individual BLEU1 score = {individual_bleu1_score * 100:.2f}')
    print(f'Individual BLEU2 score = {individual_bleu2_score * 100:.2f}')
    print(f'Individual BLEU3 score = {individual_bleu3_score * 100:.2f}')
    print(f'Individual BLEU4 score = {individual_bleu4_score * 100:.2f}')

    cumulative_bleu1_score = bleu_score(pred_trgs, trgs, max_n=4, weights=[1, 0, 0, 0])
    cumulative_bleu2_score = bleu_score(pred_trgs, trgs, max_n=4, weights=[1 / 2, 1 / 2, 0, 0])
    cumulative_bleu3_score = bleu_score(pred_trgs, trgs, max_n=4, weights=[1 / 3, 1 / 3, 1 / 3, 0])
    cumulative_bleu4_score = bleu_score(pred_trgs, trgs, max_n=4, weights=[1 / 4, 1 / 4, 1 / 4, 1 / 4])

    print(f'Cumulative BLEU1 score = {cumulative_bleu1_score * 100:.2f}')
    print(f'Cumulative BLEU2 score = {cumulative_bleu2_score * 100:.2f}')
    print(f'Cumulative BLEU3 score = {cumulative_bleu3_score * 100:.2f}')
    print(f'Cumulative BLEU4 score = {cumulative_bleu4_score * 100:.2f}')
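One caveat with the individual scores above: in torchtext's implementation the weights only choose which n-gram precisions enter the geometric mean, but every order up to max_n must still have at least one match, so BLEU-1 via weights=[1, 0, 0, 0] can differ from max_n=1. A small check, worth re-verifying on your torchtext version:

from torchtext.data.metrics import bleu_score

cand = [['the', 'cat', 'sat', 'on', 'the', 'mat']]
refs = [[['a', 'cat', 'lay', 'on', 'a', 'mat']]]

print(bleu_score(cand, refs, max_n=1, weights=[1.0]))         # 0.5
print(bleu_score(cand, refs, max_n=4, weights=[1, 0, 0, 0]))  # 0.0: no 4-gram match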
Example #26
def log_progress(epoch_i,
                 start_time,
                 tr_loss,
                 val_loss,
                 translations=None,
                 tb_writer=None):
    metrics = {
        "train": {
            "loss": tr_loss,
            "ppl": math.exp(tr_loss),
        },
        "val": {
            "loss": val_loss,
            "ppl": math.exp(val_loss),
        },
    }

    # Get additional metrics
    if translations:
        src_dec_all, hyp_dec_all, ref_dec_all = translations
        m_bleu_score = bleu_score([x.split(" ") for x in hyp_dec_all],
                                  [[x.split(" ")] for x in ref_dec_all])
        metrics["val"]["bleu"] = m_bleu_score * 100

        # Print translations
        helpers.print_translations(hyp_dec_all,
                                   ref_dec_all,
                                   src_dec_all,
                                   limit=50)

    # Print stuff
    end_time = time.time()
    epoch_hours, epoch_mins, epoch_secs = helpers.epoch_time(
        start_time, end_time)
    print("------------------------------------------------------------")
    print(f'Epoch: {epoch_i + 1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(
        f'\t- Train Loss: {metrics["train"]["loss"]:.3f} | Train PPL: {metrics["train"]["ppl"]:.3f}'
    )
    val_bleu = metrics["val"].get("bleu", float("nan"))
    print(
        f'\t- Val Loss: {metrics["val"]["loss"]:.3f} | Val PPL: {metrics["val"]["ppl"]:.3f} | Val BLEU: {val_bleu:.3f}'
    )
    print("------------------------------------------------------------")

    # Tensorboard
    if tb_writer:
        for split in ["train", "val"]:
            for k, v in metrics[split].items():
                tb_writer.add_scalar(f'{split}_{k.lower()}', v, epoch_i + 1)
                wandb.log({f'{split}_{k.lower()}': v})

    return metrics
Example #27
def cust_bleu(output_path, target_path):
    spacy_eng = spacy.load("en")

    with open(output_path, encoding='utf8') as f:
        output_data = f.read().split('\n')
    with open(target_path, encoding='utf8') as f:
        target_data = f.read().split('\n')

    outputs = [s.lower().split(' ') for s in output_data]
    targets = []
    for sent in target_data:
        targets.append([[tok.text.lower() for tok in spacy_eng(sent)]])

    return bleu_score(outputs, targets)
Example #28
def bleu(data, model, japanese, english, device):
    targets = []
    outputs = []

    for example in data:
        src = vars(example)["jap"]
        trg = vars(example)["eng"]

        prediction = translate_sentence(model, src, japanese, english, device)
        prediction = prediction[:-1]  # remove <eos> token
        targets.append([trg])
        outputs.append(prediction)

    return bleu_score(outputs, targets)
Example #29
def test_model(test_loader, model, device):
    reference = []
    pred_list = []
    model.eval()
    for data in test_loader:
        src, tgt, src_len, tgt_len = data
        preds = model(src, None, device)
        preds = preds.max(2)[1].transpose(1, 0)
        ref = tgt.transpose(1, 0)
        ref = id2word(ref)
        reference += [[r] for r in ref]
        pred_list += id2word(preds)
    score = bleu_score(pred_list, reference)
    return score
Example #30
def count_bleu(output, trg, TRG):
    # output shape: [T, N, E]
    # trg shape: [T, N]
    # using corpus level
    output = output.permute(1, 0, 2).max(2)[1]
    trg = trg.permute(1, 0)
    
    mask = trg.ne(TRG.vocab.stoi['<pad>'])
    output = output.masked_select(mask)
    trg = trg.masked_select(mask)
    candidate_corpus = [itos(output, TRG)]
    references_corpus = [[itos(trg, TRG)]]
    
    return bleu_score(candidate_corpus, references_corpus)
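Example #30's masked_select flattens the whole batch into one long token stream, so n-grams can straddle sentence boundaries. A hedged per-sentence variant in the spirit of Example #4 (itos is assumed to be an index-to-token lookup, pad_idx/eos_idx the special-token ids):

from torchtext.data.metrics import bleu_score

def count_bleu_per_sentence(output, trg, itos, pad_idx, eos_idx):
    # output: [T, N, E] logits, trg: [T, N] token ids (same layout as above)
    preds = output.permute(1, 0, 2).max(2)[1].tolist()
    golds = trg.permute(1, 0).tolist()

    def cut(ids):
        toks = []
        for i in ids:
            if i in (pad_idx, eos_idx):
                break  # stop at the first pad/eos instead of masking globally
            toks.append(itos[i])
        return toks

    candidates = [cut(p) for p in preds]
    references = [[cut(g)] for g in golds]
    return bleu_score(candidates, references)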