def write_summary(tar_real, predictions, step, write=config.write_summary_op):
    ref_sents = []
    hyp_sents = []
    for tar, ref_hyp in zip(tar_real, predictions):
        # Drop [PAD] (0), [CLS] (101) and [SEP] (102) ids before detokenizing.
        sum_ref = tokenizer.convert_ids_to_tokens(
            [i for i in tf.squeeze(tar) if i not in [0, 101, 102]])
        sum_hyp = tokenizer.convert_ids_to_tokens(
            [i for i in tf.squeeze(ref_hyp) if i not in [0, 101, 102]])
        sum_ref = convert_wordpiece_to_words(sum_ref)
        sum_hyp = convert_wordpiece_to_words(sum_hyp)
        ref_sents.append(sum_ref)
        hyp_sents.append(sum_hyp)
    try:
        rouges = rouge_all.get_scores(ref_sents, hyp_sents)
        avg_rouge_f1 = np.mean([
            np.mean([rouge_scores['rouge-1']['f'],
                     rouge_scores['rouge-2']['f'],
                     rouge_scores['rouge-l']['f']])
            for rouge_scores in rouges
        ])
        _, _, bert_f1 = b_score(ref_sents, hyp_sents, lang='en',
                                model_type=config.pretrained_bert_model)
        avg_bert_f1 = np.mean(bert_f1.numpy())
    except Exception:
        log.warning('Some problem while calculating ROUGE/BERT-F1 so setting both scores to zero')
        avg_rouge_f1 = 0
        avg_bert_f1 = 0
    if write and step % config.write_per_step == 0:
        with tf.io.gfile.GFile(file_path.summary_write_path + str(step.numpy()), 'w') as f:
            for ref, hyp in zip(ref_sents, hyp_sents):
                f.write(ref + '\t' + hyp + '\n')
    return (avg_rouge_f1, avg_bert_f1)
def write_output_sequence(tar_real, predictions, step, write_output_seq):
    ref_sents = []
    hyp_sents = []
    rouge_all = Rouge()
    for tar, ref_hyp in zip(tar_real, predictions):
        detokenized_refs, detokenized_hyp_sents = detokenize(
            target_tokenizer, tf.squeeze(tar), tf.squeeze(ref_hyp))
        ref_sents.append(detokenized_refs)
        hyp_sents.append(detokenized_hyp_sents)
    try:
        rouges = rouge_all.get_scores(ref_sents, hyp_sents)
        avg_rouge_f1 = np.mean([
            np.mean([rouge_scores['rouge-1']['f'],
                     rouge_scores['rouge-2']['f'],
                     rouge_scores['rouge-l']['f']])
            for rouge_scores in rouges
        ])
        _, _, bert_f1 = b_score(ref_sents, hyp_sents,
                                model_type=config.bert_score_model)
        avg_bert_f1 = np.mean(bert_f1.numpy())
    except Exception:
        log.warning('Some problem while calculating ROUGE/BERT-F1 so setting both scores to zero')
        avg_rouge_f1 = 0
        avg_bert_f1 = 0
    if write_output_seq:
        with tf.io.gfile.GFile(config.output_sequence_write_path + str(step.numpy()), 'w') as f:
            for ref, hyp in zip(ref_sents, hyp_sents):
                f.write(ref + '\t' + hyp + '\n')
    return (avg_rouge_f1, avg_bert_f1)
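# The mean-of-three-F1 reduction above recurs in every evaluation routine in
# this file. A minimal helper could consolidate it; `average_rouge_f1` is a
# hypothetical name, not part of the original codebase.
def average_rouge_f1(rouges):
    # `rouges` is the list of per-pair score dicts returned by
    # Rouge().get_scores(); average ROUGE-1/2/L F1 across all pairs.
    return np.mean([
        np.mean([r['rouge-1']['f'], r['rouge-2']['f'], r['rouge-l']['f']])
        for r in rouges
    ])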
def __init__(self, true_output_sequences, predicted_output_sequences):
    self.ref_sents = true_output_sequences
    self.hyp_sents = predicted_output_sequences
    self.calculate_rouge = Rouge()
    # Dummy scoring call so the BERT model is downloaded and cached up front.
    _ = b_score(["I'm Batman"], ["I'm Spiderman"], lang='en',
                model_type=config.target_pretrained_bert_model)
    log.info('Loaded Pre-trained BERT for BERT-SCORE calculation')
def calculate_bert_score(self):
    try:
        _, _, bert_f1 = b_score(self.references, self.hypothesis_output,
                                model_type=config.bert_score_model)
        avg_bert_f1 = np.mean(bert_f1.numpy())
    except Exception:
        log.warning('Some problem while calculating BERT_F1 score so setting it to zero')
        avg_bert_f1 = 0
    return avg_bert_f1
def run_inference(dataset, print_output=False):
    for draft_type, refine_type in draft_and_refine_decoder_combinations:
        ref_sents = []
        hyp_sents = []
        for (doc_id, (input_ids, _, _, target_ids, _, _)) in enumerate(dataset, 1):
            start_time = time.time()
            if draft_type != 'beam_search':
                _, _, refined_summary, _ = predict_using_sampling(
                    input_ids, draft_type, refine_type, k=10)
            else:
                _, refined_summary, _ = predict_using_beam_search(
                    input_ids, refine_decoder_sampling_type=refine_type)
            sum_ref = tokenizer.convert_ids_to_tokens(
                [i for i in tf.squeeze(target_ids) if i not in [0, 101, 102]])
            sum_hyp = tokenizer.convert_ids_to_tokens(
                [i for i in tf.squeeze(refined_summary) if i not in [0, 101, 102]])
            sum_ref = convert_wordpiece_to_words(sum_ref)
            sum_hyp = convert_wordpiece_to_words(sum_hyp)
            if print_output:
                print('Original summary: {}'.format(sum_ref))
                print('Predicted summary: {}'.format(sum_hyp))
            ref_sents.append(sum_ref)
            hyp_sents.append(sum_hyp)
        print(f'Calculating scores for {len(ref_sents)} golden summaries and {len(hyp_sents)} predicted summaries')
        try:
            rouges = rouge_all.get_scores(ref_sents, hyp_sents)
            avg_rouge_f1 = np.mean([
                np.mean([rouge_scores['rouge-1']['f'],
                         rouge_scores['rouge-2']['f'],
                         rouge_scores['rouge-l']['f']])
                for rouge_scores in rouges
            ])
            _, _, bert_f1 = b_score(ref_sents, hyp_sents, lang='en',
                                    model_type=config.pretrained_bert_model)
            avg_bert_f1 = np.mean(bert_f1.numpy())
        except Exception:
            avg_rouge_f1 = 0
            avg_bert_f1 = 0
        print(infer_template.format(draft_type, refine_type, avg_rouge_f1, avg_bert_f1))
        print(f'time to process document {doc_id} : {time.time()-start_time}')
def run_eval(ckpt_path='/content/drive/My Drive/Text_summarization/BERT_text_summarisation/cnn_checkpoints/ckpt-69'):
    restore_chkpt(ckpt_path)
    if config.use_tfds:
        examples, metadata = tfds.load(
            config.tfds_name,
            with_info=True,
            as_supervised=True,
            data_dir='/content/drive/My Drive/Text_summarization/cnn_dataset',
            builder_kwargs={"version": "2.0.0"})
        test_examples = examples['test']
        test_buffer_size = metadata.splits['test'].num_examples
        test_dataset = map_batch_shuffle(test_examples,
                                         test_buffer_size,
                                         split='test',
                                         batch_size=h_parms.batch_size)
        log.info('Test TF_dataset created')
        test_dataset = test_dataset.take(1)
    else:
        test_dataset = infer_data_from_df()
    ref_sents = []
    hyp_sents = []
    for (doc_id, (input_ids, _, _, target_ids, _, _)) in tqdm(enumerate(test_dataset, 1)):
        start_time = time.time()
        draft, refined_summary, att = predict_using_beam_search(
            input_ids, beam_size=3, refine_decoder_type='greedy')
        for tar, ref_hyp in zip(target_ids, refined_summary):
            sum_ref = tokenizer.convert_ids_to_tokens(
                [i for i in tf.squeeze(tar) if i not in [0, 101, 102]])
            sum_hyp = tokenizer.convert_ids_to_tokens(
                [i for i in tf.squeeze(ref_hyp) if i not in [0, 101, 102]])
            sum_ref = convert_wordpiece_to_words(sum_ref)
            sum_hyp = convert_wordpiece_to_words(sum_hyp)
            ref_sents.append(sum_ref)
            hyp_sents.append(sum_hyp)
        try:
            rouges = rouge_all.get_scores(ref_sents, hyp_sents)
            avg_rouge_f1 = np.mean([
                np.mean([rouge_scores['rouge-1']['f'],
                         rouge_scores['rouge-2']['f'],
                         rouge_scores['rouge-l']['f']])
                for rouge_scores in rouges
            ])
            _, _, bert_f1 = b_score(ref_sents, hyp_sents, lang='en',
                                    model_type=config.pretrained_bert_model)
            avg_bert_f1 = np.mean(bert_f1.numpy())
        except Exception:
            avg_rouge_f1 = 0
            avg_bert_f1 = 0
        print(infer_template.format('beam_search', 'greedy', avg_rouge_f1, avg_bert_f1, 3))
        print(f'time to process document {doc_id} : {time.time()-start_time}')
    print(f'Calculating scores for {len(ref_sents)} golden summaries and {len(hyp_sents)} predicted summaries')
def run_inference(model, dataset, beam_sizes_to_try=config.beam_sizes):
    for beam_size in beam_sizes_to_try:
        ref_sents = []
        hyp_sents = []
        for (doc_id, (input_ids, _, _, target_ids, _, _)) in enumerate(dataset, 1):
            start_time = time.time()
            # translated_output_temp[0] -> (batch, beam_size, summ_length+1)
            translated_output_temp, enc_output = draft_decoded_summary(
                model, input_ids, target_ids[:, :-1], beam_size)
            # Keep only the top beam for each batch element.
            draft_predictions = translated_output_temp[0][:, 0, :]
            _, _, dec_padding_mask = create_masks(input_ids, target_ids[:, :-1])
            refined_summary, attention_dists = refined_summary_greedy(
                model, input_ids, enc_output, draft_predictions,
                dec_padding_mask, training=False)
            sum_ref = tokenizer.convert_ids_to_tokens(
                [i for i in tf.squeeze(target_ids) if i not in [0, 101, 102]])
            sum_hyp = tokenizer.convert_ids_to_tokens(
                [i for i in tf.squeeze(refined_summary) if i not in [0, 101, 102]])
            sum_ref = convert_wordpiece_to_words(sum_ref)
            sum_hyp = convert_wordpiece_to_words(sum_hyp)
            print('Original summary: {}'.format(sum_ref))
            print('Predicted summary: {}'.format(sum_hyp))
            # Skip empty outputs so the ROUGE and BERT-score calls don't fail.
            if sum_ref and sum_hyp:
                ref_sents.append(sum_ref)
                hyp_sents.append(sum_hyp)
        try:
            rouges = rouge_all.get_scores(ref_sents, hyp_sents)
            avg_rouge_f1 = np.mean([
                np.mean([rouge_scores['rouge-1']['f'],
                         rouge_scores['rouge-2']['f'],
                         rouge_scores['rouge-l']['f']])
                for rouge_scores in rouges
            ])
            _, _, bert_f1 = b_score(ref_sents, hyp_sents, lang='en',
                                    model_type=config.pretrained_bert_model)
            avg_bert_f1 = np.mean(bert_f1.numpy())
        except Exception:
            avg_rouge_f1 = 0
            avg_bert_f1 = 0
        print(infer_template.format(beam_size, avg_rouge_f1, avg_bert_f1))
        print(f'time to process document {doc_id} : {time.time()-start_time}')
def evaluate_bert_score(self):
    try:
        _, _, bert_f1 = b_score(self.ref_sents, self.hyp_sents,
                                model_type=config.bert_score_model)
        avg_bert_f1 = np.mean(bert_f1.numpy())
    except Exception:
        log.warning('Some problem while calculating BERT score so setting it to zero')
        avg_bert_f1 = 0
    return avg_bert_f1
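# Usage sketch: `__init__` and `evaluate_bert_score` above appear to belong to
# a single evaluation class whose name is not shown in this section;
# `EvaluationMetrics` below is a hypothetical stand-in for it.
# metrics = EvaluationMetrics(true_output_sequences=ref_sents,
#                             predicted_output_sequences=hyp_sents)
# avg_bert_f1 = metrics.evaluate_bert_score()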
def run_inference(dataset, beam_sizes_to_try=h_parms.beam_sizes):
    for beam_size in beam_sizes_to_try:
        total_summary = []
        for (doc_id, (document, summary)) in enumerate(dataset, 1):
            start_time = time.time()
            # translated_output_temp[0] -> (batch, beam_size, summ_length+1)
            translated_output_temp = beam_search_eval(document, beam_size)
            # Keep only the top beam for each batch element.
            draft_predictions = translated_output_temp[0][:, 0, :]
            # Ids >= vocab_size are the special start/end tokens added by the
            # subword tokenizer, so keep only in-vocabulary ids when decoding.
            sum_ref = tokenizer.decode(
                [j for j in tf.squeeze(summary) if j < tokenizer.vocab_size])
            sum_hyp = tokenizer.decode(
                [j for j in tf.squeeze(draft_predictions) if j < tokenizer.vocab_size])
            total_summary.append((sum_ref, sum_hyp))
            print('Original summary: {}'.format(sum_ref))
            print('Predicted summary: {}'.format(sum_hyp))
        ref_sents = [ref for ref, _ in total_summary]
        hyp_sents = [hyp for _, hyp in total_summary]
        rouges = rouge_all.get_scores(ref_sents, hyp_sents)
        avg_rouge_f1 = np.mean([
            np.mean([rouge_scores['rouge-1']['f'],
                     rouge_scores['rouge-2']['f'],
                     rouge_scores['rouge-l']['f']])
            for rouge_scores in rouges
        ])
        _, _, bert_f1 = b_score(ref_sents, hyp_sents, lang='en',
                                model_type='bert-base-uncased')
        print(infer_template.format(beam_size, avg_rouge_f1, np.mean(bert_f1.numpy())))
        print(f'time to process document {doc_id} : {time.time()-start_time}')
def write_summary(tar_real, predictions, step, write=config.write_summary_op):
    total_summary = []
    for i, sub_tar_real in enumerate(tar_real):
        # Greedy decode: take the argmax over the vocabulary at each position.
        predicted_id = tf.cast(tf.argmax(predictions[i], axis=-1), tf.int32)
        sum_ref = tokenizer.convert_ids_to_tokens(
            [token_id for token_id in sub_tar_real.numpy() if token_id not in [0, 101, 102]])
        sum_hyp = tokenizer.convert_ids_to_tokens(
            [token_id for token_id in predicted_id.numpy() if token_id not in [0, 101, 102]])
        sum_ref = convert_wordpiece_to_words(sum_ref)
        sum_hyp = convert_wordpiece_to_words(sum_hyp)
        # Don't consider empty values for ROUGE and BERT score calculation.
        if sum_hyp and sum_ref:
            total_summary.append((sum_ref, sum_hyp))
    ref_sents = [ref for ref, _ in total_summary]
    hyp_sents = [hyp for _, hyp in total_summary]
    # get_scores returns a list of score dicts, one per (ref, hyp) pair.
    if ref_sents and hyp_sents:
        try:
            rouges = rouge_all.get_scores(ref_sents, hyp_sents)
            avg_rouge_f1 = np.mean([
                np.mean([rouge_scores['rouge-1']['f'],
                         rouge_scores['rouge-2']['f'],
                         rouge_scores['rouge-l']['f']])
                for rouge_scores in rouges
            ])
            _, _, bert_f1 = b_score(ref_sents, hyp_sents, lang='en',
                                    model_type=config.pretrained_bert_model)
            rouge_score = avg_rouge_f1.astype('float64')
            bert_f1_score = np.mean(bert_f1.tolist(), dtype=np.float64)
        except ValueError:
            log.warning('Problem in calculating the ROUGE scores')
            rouge_score = 0
            bert_f1_score = 0
    else:
        log.warning('The sentences predicted by the model are empty so setting the scores to 0')
        rouge_score = 0
        bert_f1_score = 0
    if write and step % config.write_per_step == 0:
        with tf.io.gfile.GFile(file_path.summary_write_path + str(step.numpy()), 'w') as f:
            for ref, hyp in total_summary:
                f.write(ref + '\t' + hyp + '\n')
    return (rouge_score, bert_f1_score)
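# `convert_wordpiece_to_words` is called throughout this section but not
# defined in it. A minimal sketch of what it presumably does, assuming
# standard BERT WordPiece conventions ('##' marks a continuation piece);
# the repo's actual implementation may differ.
def convert_wordpiece_to_words(wordpieces):
    # Sketch only: merge '##' continuation pieces into the preceding word
    # and join the resulting words with spaces.
    words = []
    for piece in wordpieces:
        if piece.startswith('##') and words:
            words[-1] += piece[2:]
        else:
            words.append(piece)
    return ' '.join(words)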
_, _, refined_summary, _ = predict_using_sampling(
    input_ids,
    draft_decoder_type=draft_dec_type,
    refine_decoder_type=refine_dec_type,
    k=k,
    p=p,
    temperature=temperature)
for tar, ref_hyp in zip(target_ids, refined_summary):
    sum_ref = tokenizer.convert_ids_to_tokens(
        [i for i in tf.squeeze(tar) if i not in [0, 101, 102]])
    sum_hyp = tokenizer.convert_ids_to_tokens(
        [i for i in tf.squeeze(ref_hyp) if i not in [0, 101, 102]])
    sum_ref = convert_wordpiece_to_words(sum_ref)
    sum_hyp = convert_wordpiece_to_words(sum_hyp)
    #print('Original summary: {}'.format(sum_ref))
    #print('Predicted summary: {}'.format(sum_hyp))
    ref_sents.append(sum_ref)
    hyp_sents.append(sum_hyp)
try:
    rouges = rouge_all.get_scores(ref_sents, hyp_sents)
    avg_rouge_f1 = np.mean([
        np.mean([rouge_scores['rouge-1']['f'],
                 rouge_scores['rouge-2']['f'],
                 rouge_scores['rouge-l']['f']])
        for rouge_scores in rouges
    ])
    _, _, bert_f1 = b_score(ref_sents, hyp_sents, lang='en',
                            model_type=config.pretrained_bert_model)
    avg_bert_f1 = np.mean(bert_f1.numpy())
except Exception:
    avg_rouge_f1 = 0
    avg_bert_f1 = 0
print(infer_template.format(draft_dec_type, refine_dec_type, avg_rouge_f1,
                            avg_bert_f1, p, k, temperature, beam_size))
print(f'time to process document {doc_id} : {time.time()-start_time}')
print(f'Calculating scores for {len(ref_sents)} golden summaries and {len(hyp_sents)} predicted summaries')
# -*- coding: utf-8 -*-
import tensorflow as tf
import numpy as np
import shutil
import os
from configuration import config
from hyper_parameters import h_parms
from rouge import Rouge
from input_path import file_path
from create_tokenizer import tokenizer
from bert_score import score as b_score
from creates import log, monitor_metrics

log.info('Loading Pre-trained BERT model for BERT SCORE calculation')
# Dummy scoring call so the BERT model is downloaded and cached before training.
_, _, _ = b_score(["I'm Batman"], ["I'm Spiderman"], lang='en',
                  model_type=config.pretrained_bert_model)
rouge_all = Rouge()


class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Learning-rate schedule from 'Attention Is All You Need':
    linear warmup followed by inverse-square-root decay."""

    def __init__(self, d_model, warmup_steps=4000):
        super(CustomSchedule, self).__init__()
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** -1.5)
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)
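# Usage sketch for CustomSchedule, following the standard TensorFlow
# Transformer recipe; the d_model value and Adam hyperparameters below are
# illustrative assumptions, not values taken from this repo's config.
# learning_rate = CustomSchedule(d_model=512)
# optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9,
#                                      beta_2=0.98, epsilon=1e-9)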