def moses_bl_rouge(p, l):
    """Print the Moses BLEU score and the ROUGE-1/2/L F, P and R scores.

    ``p`` and ``l`` are passed unchanged, in that order, to
    ``bleu.moses_multi_bleu`` and ``rouge.rouge``.  Nothing is returned; the
    ten values are written to stdout, one per line.
    """
    bleu_score = bleu.moses_multi_bleu(p, l)
    rouge_scores = rouge.rouge(p, l)

    # Order matters: it must line up with the placeholders in the report.
    rouge_keys = (
        'rouge_1/f_score', 'rouge_1/p_score', 'rouge_1/r_score',
        'rouge_2/f_score', 'rouge_2/p_score', 'rouge_2/r_score',
        'rouge_l/f_score', 'rouge_l/p_score', 'rouge_l/r_score',
    )
    values = (bleu_score,) + tuple(rouge_scores[key] for key in rouge_keys)

    print(
        'Moses BLEU: %f\nROUGE1-F: %f\nROUGE1-P: %f\nROUGE1-R: %f\nROUGE2-F: %f\nROUGE2-P: %f\nROUGE2-R: %f\nROUGEL-F: %f\nROUGEL-P: %f\nROUGEL-R: %f'
        % values)
def bleu_eval(ref_dir, dec_dir):
    """Compute a corpus BLEU score from two directories of ``*.txt`` files.

    The ``*.txt`` files of ``dec_dir`` and ``ref_dir`` are each sorted by path
    and paired positionally; pairing stops at the shorter listing.  Every file
    is reduced to a single string by stripping each line and joining the
    lines with no separator.

    Args:
        ref_dir: Directory holding the reference files (a ``/`` is appended).
        dec_dir: Directory holding the decoded files (a ``/`` is appended).

    Returns:
        Whatever ``bleu.moses_multi_bleu(decoded, references)`` returns.
    """
    ref_dir = ref_dir + '/'
    dec_dir = dec_dir + '/'
    ref = []
    dec = []
    file_pairs = zip(sorted(glob.glob(dec_dir + '*.txt')),
                     sorted(glob.glob(ref_dir + '*.txt')))
    for dec_path, ref_path in file_pairs:
        # Context managers close the handles promptly; the previous
        # open(...).readlines() form left that to the garbage collector.
        with open(dec_path) as dec_file:
            dec_tex = ''.join(line.strip() for line in dec_file)
        with open(ref_path) as ref_file:
            ref_tex = ''.join(line.strip() for line in ref_file)
        ref.append(ref_tex)
        dec.append(dec_tex)
    bleu_score = bleu.moses_multi_bleu(dec, ref)
    return bleu_score
def get_metrics(f1,f2):
    """Score decoded files against reference files and format the result.

    ``f1`` and ``f2`` are glob patterns.  The matches of each pattern are
    sorted and paired positionally (pairing stops at the shorter listing).
    Files matched by ``f1`` are treated as decoded output, files matched by
    ``f2`` as references.  Each file is collapsed to one string by stripping
    every line and joining the lines with no separator.

    The number of processed pairs is printed to stdout.

    Returns:
        A tab-prefixed, tab-separated string with four ``%.2f`` fields: the
        BLEU score, then ROUGE-1, ROUGE-2 and ROUGE-L F-scores times 100.
    """
    ref = []
    decoded = []
    for dec_path, ref_path in zip(sorted(glob.glob(f1)), sorted(glob.glob(f2))):
        # Use context managers so the file handles are closed deterministically.
        with open(dec_path) as dec_file:
            dec_tex = ''.join(line.strip() for line in dec_file)
        with open(ref_path) as ref_file:
            ref_tex = ''.join(line.strip() for line in ref_file)
        ref.append(ref_tex)
        decoded.append(dec_tex)

    bl = bleu.moses_multi_bleu(decoded,ref)
    x = rouge.rouge(decoded,ref)
    s = "\t%.2f\t%.2f\t%.2f\t%.2f"%(bl,x['rouge_1/f_score']*100,x['rouge_2/f_score']*100,x['rouge_l/f_score']*100)
    # One entry is appended per processed pair, so this is the pair count.
    print(len(decoded))
    return s
def _bleu_fn(hypotheses, references):
    """Return the case-sensitive Moses BLEU score of hypotheses vs references.

    Both arguments are arrays exposing ``.dtype`` whose elements are either
    unicode or UTF-8 encoded bytes.  Every element is turned into a text
    string, passed through ``utils.slice_text`` and then scored with
    ``bleu.moses_multi_bleu(..., lowercase=False)``.  The score is also
    printed to stdout.
    """

    def _as_text(array):
        # Unicode arrays are first encoded to UTF-8 bytes so that both input
        # kinds go through the same decode step below.
        if array.dtype.kind == np.dtype("U"):
            array = np.char.encode(array, "utf-8")
        return [item.decode("utf-8") for item in array]

    hypothesis_texts = _as_text(hypotheses)
    reference_texts = _as_text(references)

    # Slice all hypotheses and references up to SOS -> EOS
    trimmed_hypotheses = [utils.slice_text(text) for text in hypothesis_texts]
    trimmed_references = [utils.slice_text(text) for text in reference_texts]

    bleu_score = bleu.moses_multi_bleu(trimmed_hypotheses, trimmed_references, lowercase=False)  #pylint: disable=E1102
    print('bleu_score:', bleu_score)
    return bleu_score
def get_metrics(f1, f2):
    """Compute the BLEU score of decoded files against reference files.

    ``f1`` and ``f2`` are glob patterns; both are printed first.  The matches
    of each pattern are sorted and paired positionally (pairing stops at the
    shorter listing).  Files matched by ``f1`` are treated as decoded output,
    files matched by ``f2`` as references.  Each file is collapsed to one
    string by stripping every line and joining the lines with no separator.
    The number of decoded and reference strings is printed afterwards.

    Returns:
        The tuple ``(0, 0, 0, bl)``: three literal-zero placeholders followed
        by the value of ``bleu.moses_multi_bleu(decoded, ref)``.
    """
    ref = []
    decoded = []
    print(f1)
    print(f2)
    for dec_path, ref_path in zip(sorted(glob.glob(f1)), sorted(glob.glob(f2))):
        # Use context managers so the file handles are closed deterministically.
        with open(dec_path) as dec_file:
            dec_tex = ''.join(line.strip() for line in dec_file)
        with open(ref_path) as ref_file:
            ref_tex = ''.join(line.strip() for line in ref_file)
        ref.append(ref_tex)
        decoded.append(dec_tex)
    print(len(decoded))
    print(len(ref))
    # NOTE(review): the ROUGE result is computed but not returned; the call is
    # kept in case callers rely on it running (e.g. for its validation).
    rouge.rouge(decoded, ref)
    bl = bleu.moses_multi_bleu(decoded, ref)  # TODO: replace by pycoco bleu
    return 0, 0, 0, bl
def calculate_metrics(y_pred, y_true, orig_y_pred=None, verbose=False, bleu=False):
    '''
    Calculate exact match accuracy, precision, recall, F1 score, word-level
    accuracy and, when ``orig_y_pred`` is given, attack-success statistics.

    y_pred and y_true are lists of whitespace-tokenised strings; only the
    first ``min(len(y_pred), len(y_true))`` pairs are scored and a warning is
    printed when the lengths differ.

    orig_y_pred, if not None, is a list of strings indexed like y_pred.  For
    each sample two counts are recorded over the positions shared by the
    three token lists:
      * li_orig_match  - positions where orig_y_pred agrees with y_true
      * li_exact_match - those of the above where y_pred disagrees with y_true

    verbose wraps the sample loop in a tqdm progress bar; bleu adds a 'BLEU'
    entry computed by moses_multi_bleu.

    function returns dict with the calculated metrics (percentages for
    precision/recall/f1/exact_match/word-level accuracy).  Empty input yields
    zeros instead of raising ZeroDivisionError.
    '''
    eps = 0.0000000001

    N = min(len(y_pred), len(y_true))
    if len(y_pred) != len(y_true):
        print(
            'Warning: The number of predictions and ground truths are not equal, calculating metrics over %d points'
            % N)

    # for precision, recall, f1
    tp = 0
    fp = 0
    fn = 0

    # for exact match
    exact_match = 0
    exact_match_idx = []
    li_exact_match, li_orig_match = [], []

    # for word-level accuracy
    correct_words = 0
    total_words = 0

    # 1: per-word count overlap, 2: code2seq-style presence/absence counting.
    calc_type = 2

    indices = tqdm.tqdm(range(N)) if verbose else range(N)
    for i in indices:
        pred = y_pred[i].split()
        true = y_true[i].split()

        total_words += len(true)
        # Position-wise token agreement over the shared prefix length.
        correct_words += sum(1 for p_tok, t_tok in zip(pred, true) if p_tok == t_tok)

        # NOTE(review): the loops below index d_true with keys of d_pred, so
        # get_freqs presumably returns two dicts over the same key set.
        d_pred, d_true = get_freqs(pred, true)

        if pred == true:
            exact_match += 1
            # Only non-trivial, <unk>-free exact matches are indexed.
            if len(pred) > 1 and ('<unk>' not in pred):
                exact_match_idx.append(i)

        if orig_y_pred is not None:
            orig_pred = orig_y_pred[i].split()
            exact_matches, orig_exact_match_cnt = 0, 0
            for t_tok, p_tok, o_tok in zip(true, pred, orig_pred):
                if t_tok == o_tok:
                    orig_exact_match_cnt += 1
                    # The original prediction was right here but y_pred is not.
                    if p_tok != t_tok:
                        exact_matches += 1
            li_exact_match.append(exact_matches)
            li_orig_match.append(orig_exact_match_cnt)

        if calc_type == 1:
            # this is my implementation
            for word in d_pred:
                tp += min(d_pred[word], d_true[word])
                fp += max(0, d_pred[word] - d_true[word])
                fn += max(0, d_true[word] - d_pred[word])
        else:
            # this is the code2seq implementation
            for word in d_pred:
                if d_pred[word] > 0:
                    if d_true[word] > 0:
                        tp += 1
                    else:
                        fp += 1
                if d_true[word] > 0 and d_pred[word] == 0:
                    fn += 1

    # eps keeps the ratios defined when a denominator would otherwise be 0.
    precision = tp / (tp + fp + eps)
    recall = tp / (tp + fn + eps)
    f1 = 2 * precision * recall / (precision + recall + eps)

    # Guard against empty input / all-empty references.
    exact_match = exact_match / N if N else 0.0
    word_level_accuracy = correct_words / total_words if total_words else 0.0

    # Fall back to 1 so the divisions below stay defined.
    sum_li = sum(li_orig_match) or 1
    asr_dataset = round(sum(li_exact_match) / sum_li * 100, 2)
    ax = [
        e / o if o != 0 else 0
        for e, o in zip(li_exact_match, li_orig_match)
    ]
    # NOTE(review): this divides by sum_li rather than len(ax); kept as-is,
    # confirm whether a per-sample mean was intended.
    asr_sample_mean = round(sum(ax) / sum_li, 2)
    # np.std of an empty array is NaN (with a RuntimeWarning); report 0.0.
    asr_sample_std = round(np.std(np.array(ax)), 2) if ax else 0.0

    d = {
        'precision': precision * 100,
        'recall': recall * 100,
        'f1': f1 * 100,
        'exact_match': exact_match * 100,
        'word-level accuracy': word_level_accuracy * 100,
        'total_samples': N,
        'asr_dataset': asr_dataset,
        'asr_sample_mean': asr_sample_mean,
        'asr_sample_std': asr_sample_std,
        'li_exact_match': li_exact_match,
        'li_orig_match': li_orig_match,
        'exact_match_idx': exact_match_idx
    }

    if bleu:
        bleu_score = moses_multi_bleu(np.array(y_pred), np.array(y_true))
        d['BLEU'] = bleu_score

    return d
def calculate_metrics(y_pred, y_true, verbose=False, bleu=False):
    '''
    Calculate exact match accuracy, precision, recall, F1 score, word-level accuracy

    y_pred and y_true are lists of whitespace-tokenised strings; only the
    first ``min(len(y_pred), len(y_true))`` pairs are scored and a warning is
    printed when the lengths differ.

    verbose wraps the sample loop in a tqdm progress bar; bleu adds a 'BLEU'
    entry computed by moses_multi_bleu.

    function returns dict with the calculated metrics, all as percentages.
    Empty input yields zeros instead of raising ZeroDivisionError.
    '''
    eps = 0.0000000001

    N = min(len(y_pred),len(y_true))
    if len(y_pred)!=len(y_true):
        print('Warning: The number of predictions and ground truths are not equal, calculating metrics over %d points'%N)

    # for precision, recall, f1
    tp = 0
    fp = 0
    fn = 0

    # for exact match
    exact_match = 0

    # for word-level accuracy
    correct_words = 0
    total_words = 0

    # 1: per-word count overlap, 2: code2seq-style presence/absence counting.
    calc_type = 2

    indices = tqdm.tqdm(range(N)) if verbose else range(N)
    for i in indices:
        pred = y_pred[i].split()
        true = y_true[i].split()

        total_words += len(true)
        # Position-wise token agreement over the shared prefix length.
        correct_words += sum(1 for p_tok, t_tok in zip(pred, true) if p_tok == t_tok)

        # NOTE(review): the loops below index d_true with keys of d_pred, so
        # get_freqs presumably returns two dicts over the same key set.
        d_pred, d_true = get_freqs(pred, true)

        if pred == true:
            exact_match += 1

        if calc_type==1:
            # this is my implementation
            for word in d_pred:
                tp += min(d_pred[word], d_true[word])
                fp += max(0, d_pred[word]-d_true[word])
                fn += max(0, d_true[word]-d_pred[word])
        else:
            # this is the code2seq implementation
            for word in d_pred:
                if d_pred[word]>0:
                    if d_true[word]>0:
                        tp += 1
                    else:
                        fp += 1
                if d_true[word]>0 and d_pred[word]==0:
                    fn += 1

    # eps keeps the ratios defined when a denominator would otherwise be 0.
    precision = tp / (tp+fp+eps)
    recall = tp / (tp+fn+eps)
    f1 = 2*precision*recall / (precision+recall+eps)

    # Guard against empty input / all-empty references.
    exact_match = exact_match / N if N else 0.0
    word_level_accuracy = correct_words / total_words if total_words else 0.0

    d = {
        'precision': precision*100,
        'recall': recall*100,
        'f1': f1*100,
        'exact_match':exact_match*100,
        'word-level accuracy': word_level_accuracy*100,
    }

    if bleu:
        bleu_score = moses_multi_bleu(np.array(y_pred), np.array(y_true))
        d['BLEU'] = bleu_score

    return d
# Command-line script: pair decoded and reference ``*.txt`` files, dump them
# to ``analysis.csv`` and print BLEU / ROUGE-1 / ROUGE-2 / ROUGE-L scores.
#
# Usage: <script> <decoded_prefix> <reference_prefix>
# ``'*.txt'`` is appended directly to each prefix, so a directory argument
# needs its trailing path separator.
import sys
import glob
import rouge
import bleu
import pandas as pd

f1 = sys.argv[1]  # decoded
f2 = sys.argv[2]  # reference

ref = []
decoded = []

# Sorted listings are paired positionally; pairing stops at the shorter one.
for dec_path, ref_path in zip(sorted(glob.glob(f1 + '*.txt')),
                              sorted(glob.glob(f2 + '*.txt'))):
    # Each file becomes one string: lines are stripped and joined with no
    # separator.  Context managers make sure the handles are closed.
    with open(dec_path) as dec_file:
        dec_tex = ''.join(line.strip() for line in dec_file)
    with open(ref_path) as ref_file:
        ref_tex = ''.join(line.strip() for line in ref_file)
    ref.append(ref_tex)
    decoded.append(dec_tex)

data = {'decoded': decoded, 'reference': ref}
df = pd.DataFrame(data)
df.to_csv('analysis.csv', index=False)

bl = bleu.moses_multi_bleu(decoded, ref)
x = rouge.rouge(decoded, ref)
# Tab-separated: BLEU, then the three ROUGE F-scores scaled to percentages.
print('%.2f\t%.2f\t%.2f\t%.2f' %
      (bl, x['rouge_1/f_score'] * 100, x['rouge_2/f_score'] * 100,
       x['rouge_l/f_score'] * 100))
def _train(epoch: int,
           enc: nn.Module,
           dec: nn.Module,
           disc: nn.Module,
           prior_size: int,
           dl: Iterator,
           vocab: Vocab,
           device: str,
           validate: bool = False) -> Tuple[float, float, float, float]:
    """Run one training or validation epoch of the adversarial autoencoder.

    For every batch the discriminator is scored on encoder latents (target 0)
    versus samples drawn from ``torch.randn`` with random labels (target 1);
    then the decoder reconstructs the sequence from the latent, being fed the
    ground-truth token at each step, and the encoder is additionally scored
    on making the discriminator output 1 for its latents.  Unless ``validate``
    is set, each phase is followed by a backward pass and optimizer steps via
    the ``.optim`` attribute of the respective module.

    Args:
        epoch: Epoch number, used only in the printed summary.
        enc, dec, disc: Encoder, decoder and discriminator modules.
        prior_size: Width of the random prior samples given to ``disc``.
        dl: Batch iterable supporting ``len()``; each batch exposes ``.text``
            (first row is dropped, presumably the ``<sos>`` token) and
            ``.label``.
        vocab: Vocabulary; ``vocab.stoi['<sos>']`` seeds the decoder input.
        device: Device the helper tensors are moved to.
        validate: When True, modules are put in eval mode, no parameter
            updates happen and autograd is disabled for the whole pass.

    Returns:
        ``(ae_loss, g_loss, disc_loss, bleu)``: the three losses averaged
        over the number of batches, and the ``moses_multi_bleu`` score of the
        arg-max decoded strings against the input strings.
    """
    if validate:
        enc.eval()
        dec.eval()
        disc.eval()
    else:
        enc.train()
        dec.train()
        disc.train()

    epoch_g_loss = 0.0
    epoch_ae_loss = 0.0
    epoch_disc_loss = 0.0
    strs = []
    dec_strs = []
    n_batches = len(dl)

    # No backward pass runs during validation, so skip building autograd
    # graphs there; training keeps gradients enabled.
    with torch.set_grad_enabled(not validate):
        for batch in dl:
            seq = batch.text
            seq = seq[1:]
            label = batch.label
            label = to_onehot(label, 2, device)
            (seq_len, batch_size) = seq.shape
            batch_zeros = torch.zeros((batch_size, 1)).to(device)
            batch_ones = torch.ones((batch_size, 1)).to(device)

            # ======== train/validate Discriminator ========
            if not validate:
                enc.zero_grad()
                disc.zero_grad()

            z = torch.randn((batch_size, prior_size)).to(device)
            z_label = to_onehot(
                torch.randint(0, 2, (batch_size, )).long(), 2, device)
            latent = enc(seq)
            fake_pred = disc(latent, label)
            true_pred = disc(z, z_label)

            fake_loss = F.binary_cross_entropy_with_logits(fake_pred,
                                                           batch_zeros)
            true_loss = F.binary_cross_entropy_with_logits(true_pred,
                                                           batch_ones)
            disc_loss = 0.5 * (fake_loss + true_loss)

            if not validate:
                disc_loss.backward()
                disc.optim.step()

            # ======== train/validate Autoencoder ========
            if not validate:
                enc.zero_grad()
                dec.zero_grad()
                disc.zero_grad()

            latent = enc(seq)
            x = torch.zeros(1, batch_size).to(device).long() + vocab.stoi['<sos>']
            h = None
            step_outputs = []
            for i in range(seq_len):
                o, h = dec(x, latent, h, label)
                # Next decoder input is the ground-truth token at step i.
                x = seq[i].view(1, -1)
                step_outputs.append(o)
            # Single concatenation instead of re-allocating every step.
            output = torch.cat(step_outputs, 0)

            ae_loss = F.nll_loss(output, seq.view(-1))
            fake_pred_z = disc(latent, label)
            # The encoder is rewarded when the discriminator outputs "real".
            enc_loss = F.binary_cross_entropy_with_logits(fake_pred_z,
                                                          batch_ones)
            g_loss = ae_loss + enc_loss

            if not validate:
                g_loss.backward()
                dec.optim.step()
                enc.optim.step()

            # ----------------------------------------------------
            epoch_g_loss += g_loss.item()
            epoch_ae_loss += ae_loss.item()
            epoch_disc_loss += disc_loss.item()

            # Arg-max token per position, reshaped back to (seq_len, batch).
            _, w_idxs = output.topk(1, dim=1)
            dec_seq = w_idxs.view(seq_len, batch_size)
            strs.extend(seq_to_str(seq.detach(), vocab))
            dec_strs.extend(seq_to_str(dec_seq.detach(), vocab))

    epoch_g_loss /= n_batches
    epoch_ae_loss /= n_batches
    epoch_disc_loss /= n_batches

    bleu = moses_multi_bleu(np.array(dec_strs), np.array(strs))

    mode = 'Valid' if validate else 'Train'
    print(
        "Epoch {:3} {:5}: BLEU: {:.2f}, AE: {:.5f}, G: {:.5f}, D: {:.5f} at {}"
        .format(epoch, mode, bleu, epoch_ae_loss, epoch_g_loss,
                epoch_disc_loss,
                datetime.now().strftime("%H:%M:%S")))
    return epoch_ae_loss, epoch_g_loss, epoch_disc_loss, bleu
def optain_all_data():
    """Collect BLEU / ROUGE-1 / ROUGE-2 scores for every checkpoint folder.

    Every sub-directory of ``./result_data/`` (except ``__pycache__``) is
    treated as one run.  Its response files are obtained, in epoch order,
    from ``sort_filenames_on_epoch(folder_path, 'response_str')`` and each is
    scored against ``../data/tokenized_target.txt``.  In both files every line
    is stripped and the markers ``"<bos> "`` / ``" <eos>"`` are removed.
    Progress and scores are printed to stdout.

    Returns:
        A list of ``(folder, epoch_data)`` tuples, where ``epoch_data`` is a
        list of ``(bleu, rouge1_f1 * 100, rouge2_f1 * 100)`` tuples, one per
        response file; a ``None`` response file name yields ``(-1, -1, -1)``.
    """
    main_folder = './result_data/'
    input_fname = os.path.join('../data/tokenized_target.txt')

    def read_sentences(path):
        # Read one sentence per line, without the <bos>/<eos> markers; the
        # context manager closes the file handle.
        with open(path) as handle:
            return [
                line.strip().replace("<bos> ", "").replace(" <eos>", "")
                for line in handle
            ]

    # Obtain all folders
    folders = [
        f for f in os.listdir(main_folder)
        if f != '__pycache__' and os.path.isdir(os.path.join(main_folder, f))
    ]

    # The target file is identical for every checkpoint: read it once, and
    # only when the first usable response file shows up.
    dec_tex = None

    # Process each checkpoint in the folders
    epochs_data = []
    for folder in folders:
        print('folder:{}'.format(folder))
        sorted_fname_responses = sort_filenames_on_epoch(
            os.path.join(main_folder, folder), 'response_str')

        epoch_data = []
        for response_fname in sorted_fname_responses:
            if response_fname is None:
                # No response file for this epoch: record placeholder scores.
                epoch_data.append((-1, -1, -1))
                continue

            if dec_tex is None:
                dec_tex = read_sentences(input_fname)
            ref_tex = read_sentences(response_fname)

            # NOTE(review): the target sentences are passed in the first
            # (hypothesis) position and the model responses in the second;
            # confirm this ordering is intended.

            # Bleu
            print("\nBleu score...")
            bl = bleu.moses_multi_bleu(dec_tex, ref_tex)
            print(bl)

            # Rouge 1
            print("\nRouge 1 score...")
            r1_f1_score, r1_precision, r1_recall = rouge.rouge_n(
                dec_tex, ref_tex, 1)
            print(r1_f1_score * 100)

            # Rouge 2
            print("\nRouge 2 score...")
            r2_f1_score, r2_precision, r2_recall = rouge.rouge_n(
                dec_tex, ref_tex, 2)
            print(r2_f1_score * 100)

            epoch_data.append((bl, r1_f1_score * 100, r2_f1_score * 100))

        epochs_data.append((folder, epoch_data))

    return epochs_data