def compute(filename, topics=2):
    doc = Document(LDATester.PATH + filename + ".txt")
    gold_doc = Document(LDATester.PATH + filename + "_gold.txt")
    topics = len(gold_doc.sentences)
    ldaSummary = LDATester.getSummary(doc, topics)
    # print ldaSummary
    return BLEU.computeNormalize(gold_doc.document, ldaSummary, ignore=True)
def model_eval(epoch_file):
    print('=============================================')
    print()
    # Getting the model's information
    model_version_number = epoch_file.split('/')[5].split('_')[1]
    print('Testing model version: ', model_version_number)

    # Loading the model
    model = CaptionGenerator.load_from_checkpoint(checkpoint_path=epoch_file, pad_idx=pad_idx)
    model.eval()
    with open(r'../../data/caption_generator/lightning_logs/version_' + model_version_number + '/hparams.yaml') as file:
        parameters = yaml.load(file, Loader=yaml.FullLoader)
    print('With parameters: ', parameters)

    captions = [
        " ".join(model.caption_image(image, dataset.vocab)[1:-1]) for image in imgs
    ]

    # Putting the file names and their corresponding captions together in a DataFrame to then save as .tsv
    df = pd.DataFrame(data={'image': file_names, 'caption': captions})
    df.to_csv('../../data/caption_generator/version_' + model_version_number + '_outputs.tsv',
              index=False, sep='\t')

    # Generating BLEU scores
    evaluation = BLEU('../../data/caption_generator/version_' + model_version_number + '_outputs.tsv')
    azul = evaluation.get_bleu_score()

    # Generating captions for the selected examples
    examples = get_examples(model, dataset)

    print('The model achieved the following performance on the test set: ')
    print('BLEU-4 average (rounded) score: ' + '{:.3f}'.format(azul))
    print()
    print('=============================================')
    print()
    return model_version_number, parameters, azul, examples
def evaluate(loader, seq2seq, criterion, max_len):
    losses = utils.AverageMeter()
    ppls = utils.AverageMeter()
    seq2seq.eval()
    bleu = BLEU()
    tot_st = time.time()
    bleu_time = 0.

    with torch.no_grad():
        for i, example in enumerate(loader):
            src, src_lens, tgt, tgt_lens = parse(example)
            B = src.size(0)
            dec_outs, attn_ws = seq2seq(src, src_lens, tgt, tgt_lens, teacher_forcing=0.)
            loss, ppl = criterion(dec_outs, tgt[:, 1:])
            losses.update(loss, B)
            ppls.update(ppl, B)

            # BLEU
            bleu_st = time.time()
            # convert logits to preds
            preds = dec_outs.max(-1)[1]
            # get pred lens by finding EOS token
            pred_lens = get_lens(preds, max_len)
            for pred, target, pred_len, target_len in zip(preds, tgt, pred_lens, tgt_lens):
                # target_len includes the SOS & EOS tokens => 1:target_len-1.
                bleu.add_sentence(pred[:pred_len].cpu().numpy(),
                                  target[1:target_len - 1].cpu().numpy())
            bleu_time += time.time() - bleu_st

    total_time = time.time() - tot_st
    logger.debug("TIME: tot = {:.3f}\t bleu = {:.3f}".format(total_time, bleu_time))
    return losses.avg, ppls.avg, bleu.score()
def compute(filename):
    gold_doc = Document(LDATester.PATH + filename + "_gold.txt")
    doc = Document(LDATester.PATH + filename + ".txt")
    ## Get random summary
    indices = [x for x in range(len(doc.sentences))]
    random.shuffle(indices)
    indices = indices[0:len(gold_doc.sentences)]
    sentences = [doc.sentences[i] for i in indices]
    calibration = [doc.getSentenceOrginal(sentence) for sentence in sentences]
    calibration = " ".join(calibration)
    return BLEU.computeNormalize(gold_doc.document, calibration)
def test_bleu(self, N=300, gram=4):
    all_score = []
    for i in range(N):
        input_indices = self.show(self.dp.X_test[i], self.dp.X_id2w)
        o = self.model.infer(input_indices)[0]
        refer4bleu = [[' '.join([self.dp.Y_id2w.get(w, u'&') for w in self.dp.Y_test[i]])]]
        candi = [' '.join(w for w in o)]
        score = BLEU(candi, refer4bleu, gram=gram)
        all_score.append(score)
    return np.mean(all_score)
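# For orientation: the snippet above averages sentence-level BLEU over N test pairs via a
# project-specific BLEU(candi, refer4bleu, gram=...) helper. A minimal, hedged sketch of the same
# averaging loop with NLTK's smoothed sentence_bleu follows; the `pairs` variable and the
# pre-tokenized inputs are assumptions for illustration, not part of the project's API.
import numpy as np
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def average_sentence_bleu(pairs, max_n=4):
    """pairs: list of (candidate_tokens, reference_tokens) token lists."""
    smooth = SmoothingFunction().method1                 # avoids zero scores on short sentences
    weights = tuple(1.0 / max_n for _ in range(max_n))   # uniform n-gram weights, e.g. BLEU-4
    scores = [
        sentence_bleu([ref_tokens], cand_tokens, weights=weights, smoothing_function=smooth)
        for cand_tokens, ref_tokens in pairs
    ]
    return float(np.mean(scores))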
def run(step, phase, summary_version=True):
    shuffler = np.random.permutation(FLAGS.batch_size)
    current_caption_matrix = captions_list[step][shuffler]
    current_images = images_list[step][shuffler]
    current_mask_matrix = mask_list[step][shuffler]
    current_maxlen = maxlen[step]
    context, sentence, mask, train_op, loss_op, gen_words_op, l, h = operations[current_maxlen]

    # current_images : [batch_size, 1, 4096]
    # current_caption_matrix : [batch_size, n_lstm_steps]
    # mask : [batch_size, n_lstm_steps]
    if summary_version:
        _, loss, words, summary_string = sess.run(
            [train_op, loss_op, gen_words_op, summary_op],
            feed_dict={
                context: current_images,
                sentence: current_caption_matrix,
                mask: current_mask_matrix
            })
    else:
        _, loss, words, logits, onehot_labels = sess.run(
            [train_op, loss_op, gen_words_op, l, h],
            feed_dict={
                context: current_images,
                sentence: current_caption_matrix,
                mask: current_mask_matrix
            })

    avg_score = 0.0
    sentences = []
    for (w, c) in zip(words, current_caption_matrix):
        score, gen_sentence, ref_sentence = BLEU.bleu_score(w, c, ix_to_word)
        avg_score += score
        sentences.append((gen_sentence, ref_sentence))
    avg_score /= len(sentences)

    if summary_version:
        return loss, avg_score, sentences, summary_string
    else:
        return loss, avg_score, sentences
def __init__(self, is_test_mode, transform=None):
    self.test_mode = is_test_mode

    # train_dataset
    # self.train_max_length_s, self.train_max_length_t, self.train_transform, self.train_dataset, self.train_data_loader = LoadSentenceData(DATA_SET_PATH)
    # _, _, _, _, self.train_disc_data_loader = LoadSentenceData(DATA_SET_PATH, transform=self.train_transform)
    # self.test_max_length_s, self.test_max_length_t, self.test_transform, self.test_dataset, self.test_data_loader = LoadSentenceData(TEST_SET_PATH, transform=self.train_transform, _shuffle=False)
    self.train_max_length_s, self.train_max_length_t, self.train_transform, self.train_dataset, self.train_data_loader = LoadTranslateData()
    _, _, _, _, self.train_disc_data_loader = LoadTranslateData(transform=self.train_transform)
    self.test_max_length_s, self.test_max_length_t, self.test_transform, self.test_dataset, self.test_data_loader = LoadTranslateData(
        mode="test", transform=self.train_transform, _shuffle=False)
    self.train_data_num = len(self.train_dataset)
    self.test_data_num = 200

    # compute device
    self.device = DEVICE
    # vocabulary size
    self.vocab_size = len(self.train_transform.w2i)
    self.emb_vec = LoadEmbVec("data/word2vec/translate_row.vec.pt",
                              self.train_transform, self.vocab_size).to(self.device)
    self.connect_char_tensor = torch.tensor([
        self.train_transform.w2i[CONNECT_SYMBOL] for i in range(BATCH_SIZE)
    ]).unsqueeze(1).to(self.device)

    # model
    self.bce_loss = nn.CrossEntropyLoss(ignore_index=0)
    self.generator = Generator(self.vocab_size, self.emb_vec)
    self.blue = BLEU(4)

    # optimizer
    self.optimizer_gen = torch.optim.Adam(self.generator.parameters(),
                                          lr=START_LEARNING_RATE_G,
                                          betas=(0.5, 0.999))
    self.one = torch.tensor(1, dtype=torch.float).to(self.device)
    self.mone = (self.one * -1).to(self.device)
hyps = [['i', 'am', 'a', 'boy', 'and', 'test'],
        ['this', 'is', 'the', 'game', 'of', 'the', 'throne'],
        ['hyp', 'is', 'long', 'and', 'this', 'is', 'the', 'game', 'of', 'the', 'throne'],
        ['what', 'the', 'f**k', 'this', 'hmm'],
        ['the', 'short']]
refses = [
    ['i', 'am', 'a', 'boy', 'and', 'girl', 'and', 'long'],
    ['i', 'like', 'this', 'is', 'the', 'game', 'of', 'the', 'throne'],
    ['this', 'is', 'the', 'game', 'of', 'the', 'throne'],
    ['what', 'a', 'f*****g', 'serious', '?'],
    ['too', 'short', 'lang']
]

for hyp, refs in zip(hyps, refses):
    # stats
    mine_stats = BLEU.compute_stats(hyp, refs)
    org_stats = bleu_stats(hyp, refs)
    assert (mine_stats.flatten().astype(np.int64) == org_stats).all()  # np.int was removed in recent NumPy
    # bleu
    mine_bleu = BLEU.compute_bleu(mine_stats)
    org_bleu = bleu(org_stats)
    #print(mine_bleu, org_bleu)
    assert mine_bleu == org_bleu

# total bleu score
org = get_bleu(hyps, refses)
bleu = BLEU()
bleu.add_corpus(hyps, refses)
print("org:", org)
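# The test above checks a custom BLEU (compute_stats / compute_bleu / add_corpus) against
# reference helpers (bleu_stats / bleu / get_bleu). As a hedged sketch of what such statistics
# usually contain: the layout [hyp_len, ref_len, match_1, total_1, ..., match_4, total_4] is an
# assumption about this particular code, while the score formula itself is standard corpus BLEU.
import math
from collections import Counter

def ngram_counts(tokens, n):
    return Counter(tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1))

def sentence_stats(hyp, ref, max_n=4):
    stats = [len(hyp), len(ref)]
    for n in range(1, max_n + 1):
        h, r = ngram_counts(hyp, n), ngram_counts(ref, n)
        stats.append(sum((h & r).values()))      # clipped n-gram matches
        stats.append(max(len(hyp) - n + 1, 0))   # candidate n-grams
    return stats

def bleu_from_stats(stats):
    if min(stats) == 0:                          # any zero count => BLEU is 0
        return 0.0
    hyp_len, ref_len = stats[0], stats[1]
    log_prec = sum(math.log(stats[i] / stats[i + 1]) for i in range(2, len(stats), 2)) / 4.0
    log_bp = min(0.0, 1.0 - ref_len / hyp_len)   # log brevity penalty
    return math.exp(log_bp + log_prec)

# Corpus-level BLEU sums the per-sentence stat vectors elementwise over the whole corpus and
# applies the same formula, which is presumably what add_corpus / bleu.score() do above.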
!mkdir -p trained_models/bigan_20
bigan.save('trained_models/bigan_20/')

x_gen = bigan.generate()
x_gen.shape

text = 'The Sri Lankan team will play three ODI'
translator = bigan.BiGAN

params['build_model'] = True
gan = cls(**params)
gan.generator = load_model(os.path.join(path, "generator.h5"))
gan.discriminator = load_model(os.path.join(path, "discriminator.h5"))
gan.encoder = load_model(os.path.join(path, "encoder.h5"))
gan.bigan_generator = load_model(os.path.join(path, "bigan_generator.h5"))

for key, value in destination_language.items():
    print(translator.output(text, dest=value).text)

import bleu
from bleu import BLEU

candidate, references = fetch_data(candidate, BGAN)
bleu = BLEU(candidate, BGAN.op)
print(bleu)

out = open('bleu_out.txt', 'w')
out.write(str(bleu))
out.close()
hdf = hdf.loc[hdf['LP'] == lp]
hdf = hdf.loc[hdf['SYSTEM'] == sys]
hdf.reset_index(drop=True, inplace=True)

cands = []
fc = open(csdir + '/' + cs, "r", encoding='utf-8')
while True:
    line = fc.readline()
    if not line:
        break
    cands.append(wmt_data_cands(line))
assert len(cands) == len(refs)

for i in range(len(cands)):
    bleu.append(BLEU(refs[i], cands[i], 4))
outlist.append([lp, sys, sum(bleu) / len(bleu), hdf['HUMAN'].item()])

sz = len(cses)
pees = [row[2] for row in outlist[-sz:]]
hues = [row[3] for row in outlist[-sz:]]
lissy = [csdir[-5:]]
src = spearmanr(pees, hues)
pcc = pearsonr(pees, hues)
ktc = kendalltau(pees, hues)
lissy += [src.correlation, pcc[0], ktc[0]]
finlist.append(lissy)
def pattern_baseline():
    v = Vocab()
    # files for evaluating BLEU
    pred_path, gold_path = 'candidate.txt', 'reference.txt'
    pred, gold = open(pred_path, 'w+'), open(gold_path, 'w')
    ftest = open('../nlpcc-iccpol-2016.kbqa.testing-data', 'r')
    # separate files for ROUGE
    # here we use a different gold file from seq2seq because the templates extracted from the
    # training set can't cover all the predicates in the testing set
    gold_for_ROUGE = "../run/evaluation/gold_temp/question_"
    pred_for_ROUGE = "../run/evaluation/pred_temp/question_"

    # patterns extracted from the training set
    trainAP = open('trainPattern.txt', 'r')
    rel_dic = {}
    for line in trainAP:
        line = line.strip()
        pattern, rel = line.split('\t')[0], line.split('\t')[-2]
        if rel not in rel_dic:
            rel_dic[rel] = [pattern]
        else:
            rel_dic[rel].append(pattern)

    pattern = re.compile(r'[·•\-\s]|(\[[0-9]*\])')
    cnt = 0
    gold_all, pred_all = [], []
    for line in ftest:
        if line.find('<q') == 0:  # question line
            qRaw = line[line.index('>') + 2:].strip()
            continue
        elif line.find('<t') == 0:  # triple line
            triple = line[line.index('>') + 2:]
            s = triple[:triple.index(' |||')].strip()  # topic word
            triNS = triple[triple.index(' |||') + 5:]
            p = triNS[:triNS.index(' |||')]  # predicate
            p, num = pattern.subn('', p)
            if p not in rel_dic:
                with open(pred_for_ROUGE + str(cnt), 'w+') as sw:
                    sw.write('\n')
                with open(gold_for_ROUGE + str(cnt), 'w+') as sw:
                    sw.write('\n')
                pred_all.append([])
                gold_all.append([])
            else:
                sp = random.sample(rel_dic[p], 1)[0]
                sp = sp.replace('(SUB)', s)
                pred_list, gold_list = [], []
                for char in sp:
                    wid = v.word2id(char)
                    pred_list.append(str(wid))  # replace unk in pred list with 0
                pred_all.append(pred_list)
                pred.write(' '.join(pred_list) + '\n')
                with open(pred_for_ROUGE + str(cnt), 'w+') as sw:
                    sw.write(' '.join(pred_list) + '\n')
                for char in qRaw:
                    wid = v.word2id(char)
                    gold_list.append(str(-1 if wid == 0 else wid))  # replace unk in gold list with -1
                gold_all.append([gold_list])
                gold.write(' '.join(gold_list) + '\n')
                with open(gold_for_ROUGE + str(cnt), 'w+') as sw:
                    sw.write(' '.join(gold_list) + '\n')
            cnt += 1
        else:
            continue
    pred.close()
    gold.close()
    print("number of questions in test set: " + str(len(pred_all)))

    pred_set = [pred_for_ROUGE + str(i) for i in range(cnt)]
    gold_set = [[gold_for_ROUGE + str(i)] for i in range(cnt)]
    bleu = BLEU(pred_path, gold_path)
    print("Bleu: %s" % (str(bleu)))
    recall, precision, F_measure = PythonROUGE(pred_set, gold_set, ngram_order=4)
    print("F_measure: %s Recall: %s Precision: %s\n" % (str(F_measure), str(recall), str(precision)))

    r2g = open('../data/relation2group.txt', 'r')
    tfidf, cc = 0.0, 0
    for line in r2g:
        items = line.strip().split()
        if len(items) > 2:
            cc += 1
            tmp = []
            for item in items:
                tmp.append(pred_all[int(item)])
            try:
                sm = SentenceSimilarity(tmp)
                sm.TfidfModel()
                tfidf += sm.similarity()
            except ValueError:
                pass
        else:
            pass
    print("number of question clusters (under the same predicate): " + str(cc))
    tfidf /= cc
    print("Tf-idf DIVERSE: %s" % str(tfidf))
r2g = open('../data/relation2group.txt', 'r')
tfidf, cc = 0.0, 0
for line in r2g:
    items = line.strip().split()
    if len(items) > 2:
        cc += 1
        tmp = []
        for item in items:
            tmp.append(pred_all[int(item)])
        try:
            sm = SentenceSimilarity(tmp)
            sm.TfidfModel()
            tfidf += sm.similarity()
        except ValueError:
            pass
    else:
        pass
print("number of question clusters (under the same predicate): " + str(cc))
tfidf /= cc
print("Tf-idf DIVERSE: %s" % str(tfidf))


if __name__ == '__main__':
    print('extracting answer patterns from training set ...')
    getAnswerPatten()
    get_rel2group()
    print('done ...')
    pattern_baseline()
    print(BLEU('candidate.txt', 'reference.txt'))
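# Both pattern_baseline() and the __main__ block above score candidate.txt against reference.txt
# with a file-based BLEU(pred_path, gold_path) helper. A rough, hedged equivalent using NLTK is
# sketched below; the one-sentence-per-line, space-separated token format is an assumption, and
# this is not the project's own BLEU implementation.
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction

def file_bleu(candidate_path='candidate.txt', reference_path='reference.txt'):
    with open(candidate_path, encoding='utf-8') as fc, open(reference_path, encoding='utf-8') as fr:
        hypotheses = [line.split() for line in fc]
        references = [[line.split()] for line in fr]   # exactly one reference per hypothesis
    assert len(hypotheses) == len(references)
    return corpus_bleu(references, hypotheses, smoothing_function=SmoothingFunction().method1)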
def compute(filename):
    doc = Document(FrequencyTester.PATH + filename + ".txt")
    gold_doc = Document(FrequencyTester.PATH + filename + "_gold.txt")
    freqSummary = FrequencyTester.getSummary(doc, len(gold_doc.sentences))
    return BLEU.computeNormalize(gold_doc.document, freqSummary, ignore=True)
import numpy as np
import pandas as pd
from pandas import DataFrame
from AES_processing import cand_data, ref_data
from bleu import BLEU

cand_df = pd.read_csv('../data/ASAP_AES/training_set_rel3.tsv', sep='\t', encoding='ISO-8859-1')
ref_df = pd.read_csv('../data/ASAP_AES/reference_3_aes.tsv', sep='\t')
n_gram = 4

cand_id, candidate_corpus, human_scores = cand_data(cand_df)
reference_corpus = ref_data(ref_df)

bleu_scores = []
for i in range(len(candidate_corpus)):
    bleu = BLEU(reference_corpus, candidate_corpus[i], n_gram)
    # print(bleu)
    bleu_scores.append(bleu)
print(max(bleu_scores))

filename = "../results/asap_aes_results/BLEU_scores_aes.txt"
with open(filename, 'w') as f:
    f.write("candidate_id\tsimilarity\tscore\n")
    for i in range(len(candidate_corpus)):
        f.write("{0}\t{1}\t{2}\n".format(cand_id[i], bleu_scores[i], human_scores[i]))
import pandas as pd
from SAS_processing import data
from bleu import BLEU

df = pd.read_csv('../data/ASAP_SAS/train.tsv', sep='\t')
n_gram = 4
reference_corpus, candidate_corpus, reference_id, candidate_id, candidate_scores, max_scores = data(df)

# Check the correctness of length
# for i in range(len(reference_corpus)):
#     print(len(reference_corpus[i]), len(candidate_corpus[i]), len(reference_id[i]), len(candidate_id[i]))
# print(reference_id[2], candidate_id[2], candidate_scores[2], max_scores)

print(BLEU(reference_corpus[2], reference_corpus[2][50], 4))

# bleu_scores = []
# for i in range(len(reference_corpus)):
#     bleu = []
#     for j in range(len(candidate_corpus[i])):
#         bleu.append(BLEU(reference_corpus[i], candidate_corpus[i][j], n_gram))
#         # print(bleu)
#     bleu_scores.append(list(bleu))
#     # for i in range(len(bleu)):
#     #     print(bleu[i])
# for i in range(len(reference_corpus)):
#     filename = "../results/BLEU_scores_" + "EssaySet_{}".format(i + 1) + ".txt"
#     with open(filename, 'w') as f:
def evaluate_s2s(sess, dataloader, model):
    global last_best
    testset = dataloader.test_set
    pred_list = []
    k = 0
    with open(pred_path, 'w') as sw1:
        for x in dataloader.batch_iter(testset, FLAGS.batch_size, False):
            predictions = model.generate(x, sess)
            for summary in np.array(predictions):
                summary = list(summary)
                if 2 in summary:
                    # 2 (START/END) marks the end of generation
                    summary = summary[:summary.index(2)] if summary[0] != 2 else [2]
                sw1.write(" ".join([str(t) for t in summary]) + '\n')
                with open(pred_for_ROUGE + str(k), 'w+') as sw2:
                    sw2.write(" ".join([str(t) for t in summary]) + '\n')
                k += 1
                pred_list.append([str(t) for t in summary])
    print("Total questions in Test:" + str(k))

    # BLEU test
    bleu = BLEU(pred_path, gold_path)
    print("Bleu: %s" % (str(bleu)))

    # ROUGE test
    pred_set = [pred_for_ROUGE + str(i) for i in range(k)]
    gold_set = [[gold_for_ROUGE + str(i)] for i in range(k)]
    recall, precision, F_measure = PythonROUGE(pred_set, gold_set, ngram_order=4)
    print("F_measure: %s Recall: %s Precision: %s\n" % (str(F_measure), str(recall), str(precision)))

    r2g = open('../data/relation2group.txt', 'r')
    tfidf, cc = 0.0, 0
    for line in r2g:
        items = line.strip().split()
        if len(items) > 2:
            cc += 1
            tmp = []
            for item in items:
                tmp.append(pred_list[int(item)])
            try:
                sm = SentenceSimilarity(tmp)
                sm.TfidfModel()
                tfidf += sm.similarity()
            except ValueError:
                pass
        else:
            pass
    tfidf /= cc
    print("number of question clusters (under the same predicate): " + str(cc))
    print("Tf-idf DIVERSE: %s" % str(tfidf))

    result = "BLEU: %s BLEU_beam: %s \nF_measure: %s Recall: %s Precision: %s\n Tf-idf: %s\n " % \
        (str(bleu), str(0), str(F_measure), str(recall), str(precision), str(tfidf))
    if float(bleu) > last_best:
        last_best = float(bleu)
        to_word(pred_list, save_dir)
    return result
def evaluate(loader, seq2seq, criterion, max_len):
    import time
    losses = utils.AverageMeter()
    ppls = utils.AverageMeter()
    seq2seq.eval()
    bleu = BLEU()
    tot_st = time.time()
    bleu_time = 0.  # BLEU time: roughly 4s for ~13k examples; multi-CPU parallelization is possible.

    def get_lens(tensor, max_len=max_len):
        """ get first position (index) of EOS_idx in tensor = length of each sentence
        tensor: [B, T]
        """
        # assume that earlier indices come earlier in nonzero():
        # since tensor is [B, T], nonzero() yields [i, j] pairs,
        # which we assume are sorted by i first, then j.
        # e.g.) nonzero() => [[1,1], [1,2], [2,1], [2,3], [2,5], ...]
        nz = (tensor == EOS_idx).nonzero()
        is_first = nz[:-1, 0] != nz[1:, 0]
        is_first = torch.cat([torch.cuda.ByteTensor([1]), is_first])  # first mask
        # convert is_first from mask to indices by nonzero()
        first_nz = nz[is_first.nonzero().flatten()]
        lens = torch.full([tensor.size(0)], max_len, dtype=torch.long).cuda()
        lens[first_nz[:, 0]] = first_nz[:, 1]
        return lens

    with torch.no_grad():
        for i, (src, src_lens, tgt, tgt_lens) in enumerate(loader):
            B = src.size(0)
            src = src.cuda()
            tgt = tgt.cuda()
            dec_outs, attn_ws = seq2seq(src, src_lens, tgt, tgt_lens, teacher_forcing=0.)
            loss, ppl = criterion(dec_outs, tgt)
            losses.update(loss, B)
            ppls.update(ppl, B)

            # BLEU
            bleu_st = time.time()
            # convert logits to preds
            preds = dec_outs.max(-1)[1]
            # get pred lens by finding EOS token
            pred_lens = get_lens(preds)
            for pred, target, pred_len, target_len in zip(preds, tgt, pred_lens, tgt_lens):
                # target_len includes the EOS token => -1.
                bleu.add_sentence(pred[:pred_len].cpu().numpy(),
                                  target[:target_len - 1].cpu().numpy())
            bleu_time += time.time() - bleu_st

    total_time = time.time() - tot_st
    logger.debug("TIME: tot = {:.3f}\t bleu = {:.3f}".format(total_time, bleu_time))
    return losses.avg, ppls.avg, bleu.score()
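# The get_lens helper above finds the first EOS position per row with nonzero() bookkeeping.
# A simpler way to express the same idea is sketched below as a hedged alternative; like the code
# above, it assumes sequences are padded to a fixed width of max_len columns.
import torch

def get_lens_simple(tensor, eos_idx, max_len):
    """Length of each row = index of its first eos_idx; rows without EOS get max_len.
    tensor: LongTensor of shape [B, T]."""
    # count positions strictly before the first EOS in each row
    before_eos = ((tensor == eos_idx).long().cumsum(dim=1) == 0).sum(dim=1)
    return before_eos.clamp(max=max_len)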