import math

from nltk import word_tokenize
from nltk.translate import bleu_score
from nltk.translate.bleu_score import corpus_bleu


def test_corpus_bleu():
    hy = word_tokenize("I have a pen")
    re = word_tokenize("I have a apple")
    res = [re]
    # sentence_bleu over one pair must equal corpus_bleu over a one-item corpus
    # (corpus_bleu takes the list of reference-lists first, then the hypotheses)
    assert math.isclose(bleu_score.sentence_bleu(res, hy),
                        corpus_bleu([res], [hy]))

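# Follow-up sketch (not part of the original test): with more than one segment,
# corpus_bleu pools n-gram counts across segments instead of averaging
# per-sentence scores, so the two values generally differ. The sentences below
# are made up for illustration.
from nltk import word_tokenize
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction


def demo_corpus_vs_sentence_bleu():
    smooth = SmoothingFunction().method1
    refs = [[word_tokenize("the cat sat on the mat")],
            [word_tokenize("there is a cat on the mat")]]
    hyps = [word_tokenize("the cat is on the mat"),
            word_tokenize("a cat sits on the mat")]
    # average of per-sentence BLEU scores
    avg_sentence = sum(
        sentence_bleu(r, h, smoothing_function=smooth)
        for r, h in zip(refs, hyps)) / len(hyps)
    # corpus-level BLEU with pooled n-gram counts
    pooled = corpus_bleu(refs, hyps, smoothing_function=smooth)
    print(avg_sentence, pooled)  # generally not equal
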
def evaluate(model, val_loader, criterion, optimizer, device, writer):
    """Evaluate model for 1 epoch.

    Inputs: model, val_loader, criterion, optimizer, device, writer
        model: The model to be evaluated.
        val_loader: DataLoader of validation Dataset.
        criterion: Loss function.
        optimizer: Optimizer of model.
        device: Pytorch device.
        writer: Tensorboard summary writer.
    Outputs: loss, score
        loss: Loss of current epoch.
        score: Bleu score of current epoch.
    """
    total_loss = 0
    total_length = 0
    total_score = 0
    total_num = 0
    num_batchs = len(val_loader)

    model.eval()
    with torch.no_grad():
        for batch in val_loader:
            sequence_en, sequence_de, seq_len_en, seq_len_de = batch
            sequence_en = sequence_en.to(device)
            sequence_de = sequence_de.to(device)
            # decoder input: drop the last token (<EOS> or PAD)
            shifted_sequence_de = sequence_de[:, :-1]

            _, pad_mask_en = create_mask(sequence_en)
            pad_mask_en = pad_mask_en.to(device)
            future_mask, pad_mask_de = create_mask(shifted_sequence_de)
            future_mask = future_mask.to(device)
            pad_mask_de = pad_mask_de.to(device)

            # logit: [batch, time, vocab]
            logit = model(sequence_en, shifted_sequence_de, future_mask,
                          pad_mask_en, pad_mask_de)

            # target: drop the <SOS> token
            loss = criterion(input=logit.contiguous().view(-1, logit.size(-1)),
                             target=sequence_de[:, 1:].contiguous().view(-1))
            length = sum(seq_len_de) - len(seq_len_de)
            total_loss += loss
            total_length += length

            # per-sentence BLEU of the argmax prediction against the reference
            batch_score = 0
            for b, target in enumerate(sequence_de):
                predict = torch.argmax(logit[b, :seq_len_de[b] - 1, :], dim=1)
                batch_score += bleu_score.sentence_bleu(
                    [target[1:seq_len_de[b]].cpu().numpy()],
                    predict.cpu().numpy(),
                    smoothing_function=bleu_smoothing)
                total_num += 1
            total_score += batch_score

    return total_loss / total_length, total_score / total_num

def calc_bleu(self, gen_sentence, ans_sentence):
    # accept either raw strings or pre-tokenized lists
    if isinstance(gen_sentence, str):
        gen_sentence = gen_sentence.split(" ")
    if isinstance(ans_sentence, str):
        ans_sentence = ans_sentence.split(" ")
    anses = [ans_sentence]
    # BLEU-2: uniform weights over 1- and 2-grams
    BLEUscore = bleu_score.sentence_bleu(anses, gen_sentence,
                                         weights=(0.5, 0.5))
    return BLEUscore

import sys

from nltk.translate import bleu_score


def bleu(pred, answer, mode="1-gram"):
    if mode == "1-gram":
        weights = [1.0]
    elif mode == "2-gram":
        weights = [0.5, 0.5]
    elif mode == "3-gram":
        weights = [1 / 3, 1 / 3, 1 / 3]  # weights should sum to 1
    elif mode == "4-gram":
        weights = [0.25, 0.25, 0.25, 0.25]
    else:
        sys.stdout.write("Unsupported mode\n")
        sys.exit()
    # sentence_bleu expects the reference list first, then the hypothesis
    return bleu_score.sentence_bleu([answer], pred, weights=weights)

import sys

from nltk.translate import bleu_score


def compute_bleu_score(predictedCaptions, trueCaptions, mode="4-gram"):
    if mode == "1-gram":
        weights = [1.0]
    elif mode == "2-gram":
        weights = [0.5, 0.5]
    elif mode == "3-gram":
        weights = [1 / 3, 1 / 3, 1 / 3]  # weights should sum to 1
    elif mode == "4-gram":
        weights = [0.25, 0.25, 0.25, 0.25]
    else:
        sys.stdout.write("Unsupported mode\n")
        sys.exit()
    # reference list first, hypothesis second
    return bleu_score.sentence_bleu([trueCaptions], predictedCaptions,
                                    weights=weights)

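# Usage sketch for the two n-gram helpers above (the token lists are made up,
# and the `bleu` helper is assumed to be in scope). Without a smoothing
# function, 3-gram and 4-gram modes can emit warnings and return ~0 for short
# sentences with no higher-order overlaps.
def demo_ngram_modes():
    answer = "a man is playing a guitar".split()
    pred = "a man plays the guitar".split()
    for mode in ("1-gram", "2-gram", "3-gram", "4-gram"):
        print(mode, bleu(pred, answer, mode=mode))
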
def compute_sentences_ranking(video_captions):
    """ returns [(sentence0, similarity), ..., (sentence19, similarity)] """
    sentences_global_ranking = []

    if config.experiment == 'experiment1':
        bfs = True
        embeddings = []
        labels = []
        for sentence in video_captions.sentences:
            sentence_embedding = sentence.get_sentence_embedding(bfs)
            # some sentences have no senses (e.g. 'its a t') --> no embedding!
            if len(sentence_embedding) > 0:
                embeddings.append(sentence_embedding)
                labels.append(sentence.sentence)

        embeddings_mean = np.mean(embeddings, axis=0)
        distances = [
            scipy.spatial.distance.cosine(embedding, embeddings_mean)
            for embedding in embeddings
        ]
        for i, distance in enumerate(distances):
            # labels keeps the sentences aligned with the filtered embeddings
            sentences_global_ranking.append((labels[i], distance))

    elif config.experiment == 'experiment5':
        chencherry = SmoothingFunction()
        for i, sentence1 in enumerate(video_captions.sentences):
            scores = [
                bleu_score.sentence_bleu([sentence2.sentence.split(' ')],
                                         sentence1.sentence.split(' '),
                                         smoothing_function=chencherry.method4)
                for j, sentence2 in enumerate(video_captions.sentences)
            ]  # if i != j]  # if we add 1 to all, the result shouldn't change
            score = sum(scores) / len(scores)
            sentences_global_ranking.append((sentence1.sentence, score))

    else:
        result = np.zeros([20, 20])
        for i, sentence1 in enumerate(video_captions.sentences):
            for j, sentence2 in enumerate(video_captions.sentences):
                similarities = []
                for token1_id in sentence1.tokens_id_list:
                    # find the token in sentence2 most similar to sentence1.token1
                    most_similar_token_in_sentence = (None, float('-inf'))
                    for token2_id in sentence2.tokens_id_list:
                        if (token1_id, token2_id) in config.tokens_set.tokens_similarities_closest:
                            similarity = config.tokens_set.tokens_similarities_closest[
                                (token1_id, token2_id)]
                            if similarity > most_similar_token_in_sentence[1]:
                                most_similar_token_in_sentence = (token2_id, similarity)

                    # store the token similarity (depending on the experiment,
                    # check it against the threshold)
                    if most_similar_token_in_sentence[0] is not None:
                        if config.experiment in ['experiment4', 'experiment4symmetrical']:
                            if most_similar_token_in_sentence[1] > config.th1:
                                # for each token we add 1 instead of the similarity
                                similarities.append((most_similar_token_in_sentence[0], 1.0))
                            else:
                                similarities.append((None, 0))
                        elif config.experiment == 'experiment3':
                            if most_similar_token_in_sentence[1] > config.th1:
                                similarities.append(most_similar_token_in_sentence)
                            else:
                                similarities.append((None, 0))
                        elif config.experiment == 'experiment2':
                            similarities.append(most_similar_token_in_sentence)

                # compute and store the similarity between sentence1 and sentence2
                if len(similarities) > 0:
                    sentences_similarity = float(
                        sum([a[1] for a in similarities])) / len(similarities)
                else:
                    sentences_similarity = 0
                result[i, j] = sentences_similarity

        # make the similarities symmetrical
        if config.experiment == 'experiment4symmetrical':
            for i in range(0, len(result)):
                for j in range(0, len(result)):
                    symmetric_similarity = 0
                    if result[i, j] + result[j, i] != 0:
                        symmetric_similarity = (result[i, j] + result[j, i]) / 2
                    result[i, j] = symmetric_similarity
                    result[j, i] = symmetric_similarity

        # each sentence's similarity to all others (array of size 20),
        # normalized between 0 and 1
        sentences_similarities = np.sum(result, axis=1) / result.shape[1]
        for i, similarity in enumerate(sentences_similarities):
            sentences_global_ranking.append(
                (video_captions.sentences[i].sentence, similarity))

    return sentences_global_ranking

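# Simplified sketch of the 'experiment5' branch above: rank plain caption
# strings by their average smoothed BLEU against every caption in the set.
# The function name and toy inputs are illustrative only; the real function
# works on video_captions.sentences objects and config-driven experiments.
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction


def rank_by_consensus_bleu(captions):
    chencherry = SmoothingFunction()
    ranking = []
    for hyp in captions:
        scores = [
            sentence_bleu([ref.split(' ')], hyp.split(' '),
                          smoothing_function=chencherry.method4)
            for ref in captions
        ]
        ranking.append((hyp, sum(scores) / len(scores)))
    # highest average BLEU = most "central" caption
    return sorted(ranking, key=lambda x: x[1], reverse=True)
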
#!/usr/bin/env python
# -*- coding: utf8 -*-
# for python3
# Put the references in txt1 and the MT output in txt2 to get sentence-level BLEU

from nltk import word_tokenize
from nltk.translate import bleu_score
from nltk.translate.bleu_score import SmoothingFunction

cc = SmoothingFunction()

txt1 = open("txt1.txt", encoding='utf-8').read().splitlines()
txt2 = open("txt2.txt", encoding='utf-8').read().splitlines()

l = len(txt1)
b = [1] * l
for i in range(l):
    ref = word_tokenize(txt1[i])
    hyp = word_tokenize(txt2[i])
    b[i] = str(bleu_score.sentence_bleu([ref], hyp,
                                        smoothing_function=cc.method7))

# one BLEU score per line, in the same order as the input files
f = open('bleu.txt', 'w')
f.write("\n".join(b))
f.close()

def train(model, train_loader, criterion, optimizer, device, writer, epoch,
          print_steps):
    """Train model for 1 epoch.

    Inputs: model, train_loader, criterion, optimizer, device, writer, epoch, print_steps
        model: The model to be trained.
        train_loader: DataLoader of train Dataset.
        criterion: Loss function.
        optimizer: Optimizer of model.
        device: Pytorch device.
        writer: Tensorboard summary writer.
        epoch: Index of current epoch.
        print_steps: Interval of steps to print log.
    Outputs: loss, score
        loss: Loss of current epoch.
        score: Bleu score of current epoch.
    """
    total_loss = 0
    total_length = 0  # sum of lengths of sequences
    total_score = 0
    total_num = 0  # number of samples
    step = 0
    num_batchs = len(train_loader)

    model.train()
    for batch in train_loader:
        optimizer.zero_grad()

        # learning rate schedule
        for param in optimizer.param_groups:
            param["lr"] = learning_rate_schedule(train.global_step)

        sequence_en, sequence_de, seq_len_en, seq_len_de = batch
        sequence_en = sequence_en.to(device)
        sequence_de = sequence_de.to(device)
        # decoder input: drop the last token (<EOS> or PAD)
        shifted_sequence_de = sequence_de[:, :-1]

        _, pad_mask_en = create_mask(sequence_en)
        pad_mask_en = pad_mask_en.to(device)
        future_mask, pad_mask_de = create_mask(shifted_sequence_de)
        future_mask = future_mask.to(device)
        pad_mask_de = pad_mask_de.to(device)

        # logit: [batch, time, vocab]
        logit = model(sequence_en, shifted_sequence_de, future_mask,
                      pad_mask_en, pad_mask_de)

        # target: drop the <SOS> token
        loss = criterion(input=logit.contiguous().view(-1, logit.size(-1)),
                         target=sequence_de[:, 1:].contiguous().view(-1))
        length = sum(seq_len_de) - len(seq_len_de)
        total_loss += loss
        total_length += length

        # calculate bleu score of the argmax predictions
        batch_score = 0
        for b, target in enumerate(sequence_de):
            # target, predict: [time]
            predict = torch.argmax(logit[b, :seq_len_de[b] - 1, :], dim=1)
            batch_score += bleu_score.sentence_bleu(
                [target[1:seq_len_de[b]].cpu().numpy()],
                predict.cpu().numpy(),
                smoothing_function=bleu_smoothing)
            total_num += 1
        total_score += batch_score

        loss.backward()
        optimizer.step()

        if step % print_steps == 0:
            print("epoch: {}/{}, batch: {}/{}, loss: {}, bleu score: {}".format(
                epoch, hparams.max_epochs, step + 1, num_batchs,
                loss / length, batch_score / len(seq_len_de)))
            # update graph in tensorboard
            writer.add_scalar("Loss", loss / length, train.global_step)
            writer.add_scalar("Bleu score", batch_score / len(seq_len_de),
                              train.global_step)

        step += 1
        train.global_step += 1

    # return loss & bleu_score of epoch
    return total_loss / total_length, total_score / total_num

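# Hypothetical driver for the train/evaluate loops above (a minimal sketch:
# the model, DataLoaders, criterion, optimizer, and tensorboard writer are
# assumed to be built elsewhere in the project, the original code may
# initialise train.global_step differently, and the checkpoint path is
# illustrative only).
import torch


def run_training(model, train_loader, val_loader, criterion, optimizer,
                 device, writer, max_epochs, print_steps=100):
    train.global_step = 0
    best_bleu = float("-inf")
    for epoch in range(max_epochs):
        train_loss, train_bleu = train(model, train_loader, criterion,
                                       optimizer, device, writer, epoch,
                                       print_steps)
        val_loss, val_bleu = evaluate(model, val_loader, criterion,
                                      optimizer, device, writer)
        # log per-epoch validation metrics
        writer.add_scalar("Val loss", val_loss, epoch)
        writer.add_scalar("Val bleu", val_bleu, epoch)
        # keep the checkpoint with the best validation BLEU
        if val_bleu > best_bleu:
            best_bleu = val_bleu
            torch.save(model.state_dict(), "best_model.pt")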