from rouge import Rouge


def find_rouge(file1, file2):
    # Read both files and compute ROUGE between their full contents.
    with open(file1, 'r') as myfile:
        text1 = myfile.read()
    with open(file2, 'r') as myfile:
        text2 = myfile.read()
    rouge = Rouge()
    scores = rouge.get_scores(text1, text2)
    print(scores)
    return scores
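# Hedged usage sketch (not from the original source): the file names are
# hypothetical; get_scores returns a list with one dict keyed by
# 'rouge-1' / 'rouge-2' / 'rouge-l', each holding 'f', 'p' and 'r'.
if __name__ == '__main__':
    scores = find_rouge('system_summary.txt', 'reference_summary.txt')
    print(scores[0]['rouge-l']['f'])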
def decode(self):
    start = time.time()
    counter = 0
    batch_generator = self.dataset.batches
    while True:
        try:
            batch = next(batch_generator)
            best_summary = self.beam_search(batch)  # Run beam search to get best Hypothesis

            # Extract the output ids from the hypothesis and convert back to words
            output_ids = [int(t) for t in best_summary.tokens[1:]]
            decoded_words = self.dataset.vocab.outputids2words(
                output_ids, (batch.art_oovs[0] if self.args.pointer_gen else None))

            # Remove the [STOP] token from decoded_words, if necessary
            try:
                fst_stop_idx = decoded_words.index(opt.EOS)
                decoded_words = decoded_words[:fst_stop_idx]
            except ValueError:
                pass

            hypothesis = ' '.join(decoded_words)
            self.hypotheses.append(hypothesis)
            self.references.append(batch.original_abstracts[0])  # single_pass

            counter += 1
            if counter % 10 == 0:
                print('Beam Search %d example in %d sec' % (counter, time.time() - start))
                start = time.time()
        except StopIteration:
            print('StopIteration, Beam Search end. Writing to file:', self._rouge_ref_dir)
            break

    self.write_for_rouge()
    rouge = Rouge()
    # get_scores expects (hyps, refs), so the hypotheses go first
    scores = rouge.get_scores(self.hypotheses, self.references, avg=True)
    return scores
cit_text = str(citing_sentences.loc[citing_sentences['Article_ID'] == art_id, "Clean_text"].values[:num_cits_to_use])
ARTICLE_TO_SUMMARIZE = abs_text + " " + cit_text
# truncation is a tokenizer argument, not a generate() argument
inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, truncation=True, return_tensors='pt')
# Generate Summary
summary_ids = model.generate(inputs['input_ids'], num_beams=4, min_length=100, max_length=200, early_stopping=True)
final_sum = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids]
sum_txt = ''.join(final_sum)
summary_aa = summary_aa.append({'Article_ID': art_id, 'ProducedSummary': sum_txt, 'Length': len(sum_txt)}, ignore_index=True)
summary_aa.to_csv("G:/My Drive/Thesis/output/symmaryaa.csv", index=False)

#########################################################################################################
## Evaluation Part
from rouge import Rouge
rouge = Rouge()
ref = "Cascaded Grammatical Relation Assignment In this paper we discuss cascaded Memory-Based grammatical relations assignment. In the first stages of the cascade, we find chunks of several types (NP,VP,ADJP,ADVP,PP) and label them with their adverbial function (e.g. local, temporal). In the last stage, we assign grammatical relations to pairs of chunks. We studied the effect of adding several levels to this cascaded classifier and we found that even the less performing chunkers enhanced the performance of the relation finder. We achieve 71.2 F-score for grammatical relation assignment on automatically tagged and chunked text after training on about 40,000 Wall Street Journal sentences. "
scores = rouge.get_scores(summary_1.iloc[946, 1], ref)
scores[0].get('rouge-1').get('f')
scores[0].get('rouge-2').get('f')

## Eval - sum 1a
result_1a = pd.DataFrame(columns=['Article_ID', 'R1-F', 'R1-P', 'R1-r', 'R2-F', 'R2-P', 'R2-r', 'Rl-F', 'Rl-p', 'Rl-r'])
for i in range(len(abstract)):
    art_id = summary_1a.iloc[i, 0]
    title = abstract.loc[abstract['Article_ID'] == art_id, 'Title'].values[0]
    generated_summary = title + " " + summary_1a.iloc[i, 1]
    gs = gold_summary.loc[gold_summary['Article_ID'] == art_id, 'GoldSummary'].values[0]
    scores = rouge.get_scores(generated_summary, gs)
    result_1a = result_1a.append({'Article_ID': art_id,
                                  'R1-F': scores[0].get('rouge-1').get('f'),
                                  'R1-P': scores[0].get('rouge-1').get('p'),
                                  'R1-r': scores[0].get('rouge-1').get('r'),
                                  'R2-F': scores[0].get('rouge-2').get('f'),
                                  'R2-P': scores[0].get('rouge-2').get('p'),
                                  'R2-r': scores[0].get('rouge-2').get('r'),
                                  'Rl-F': scores[0].get('rouge-l').get('f'),
                                  'Rl-p': scores[0].get('rouge-l').get('p'),
                                  'Rl-r': scores[0].get('rouge-l').get('r')}, ignore_index=True)
result_1a.to_csv("G:/My Drive/Thesis/output/result_1a.csv", index=False)
# Print one randomly sampled prediction
'''
batch_eval_size = batch_eval_y.size(0)
sample_index = random.randint(0, batch_eval_size - 1)
true_words = tokenizer.convert_ids_to_tokens(batch_eval_y[sample_index].tolist())
predict_words = tokenizer.convert_ids_to_tokens(eval_outputs[sample_index])
print('True: ' + ''.join(true_words))
print('Predict: ' + ''.join(predict_words))
print()
'''

# Batch evaluation
# After conversion, eval_outputs has the format ['id id id id', 'id id id', ...];
# padding PAD (id=0) and the end token _EOS (id=2) must be stripped.
# batch_eval_y has the same format and needs the same stripping.
eval_outputs, batch_eval_y = convert_to_RougePattern(eval_outputs, batch_eval_y)
rouge_score = rouge.get_scores(eval_outputs, batch_eval_y)

# Collect ROUGE-1, ROUGE-2 and ROUGE-L recall for each example
for i in range(len(eval_outputs)):
    batch_eval_rouge1.append(rouge_score[i]['rouge-1']['r'])
    batch_eval_rouge2.append(rouge_score[i]['rouge-2']['r'])
    batch_eval_rougeL.append(rouge_score[i]['rouge-l']['r'])

# Average each ROUGE metric
num_data = len(batch_eval_rouge1)
batch_eval_rouge1 = sum(batch_eval_rouge1) * 100 / num_data
batch_eval_rouge2 = sum(batch_eval_rouge2) * 100 / num_data
batch_eval_rougeL = sum(batch_eval_rougeL) * 100 / num_data

# Report the evaluation-set ROUGE metrics at the current step
line = 'Epoch: %3d' % (epoch + 1) + '\t| Step: %5d' % step + '\t| ROUGE-1: %10.2f' % batch_eval_rouge1 \
actual_files = sorted(glob.glob(BASE_DIR + "reference/*.txt"))
for name in actual_files:
    with open(name) as f:
        data = f.read().replace('\n', '')
    actual_abs.append(data)

num_docs_using = len(generated_abs)
val_rouge_f = {'rouge-1': 0, 'rouge-2': 0, 'rouge-l': 0}
val_rouge_p = {'rouge-1': 0, 'rouge-2': 0, 'rouge-l': 0}
val_rouge_r = {'rouge-1': 0, 'rouge-2': 0, 'rouge-l': 0}
for i in range(num_docs_using):
    generated = generated_abs[i]
    reference = actual_abs[i]
    rouge_scores = rouge.get_scores(generated, reference)[0]
    for r in ['rouge-1', 'rouge-2', 'rouge-l']:
        val_rouge_f[r] += rouge_scores[r]['f']
        val_rouge_p[r] += rouge_scores[r]['p']
        val_rouge_r[r] += rouge_scores[r]['r']

for i in val_rouge_f:
    val_rouge_f[i] /= num_docs_using
    val_rouge_p[i] /= num_docs_using
    val_rouge_r[i] /= num_docs_using
    val_rouge_f[i] *= 100
    val_rouge_p[i] *= 100
    val_rouge_r[i] *= 100

print("Precision:", val_rouge_p)
print("Recall:", val_rouge_r)
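# Added sketch (not in the original script): the manual accumulation above can
# also be delegated to the rouge package itself, which accepts parallel lists
# of hypotheses and references together with avg=True.
avg_scores = rouge.get_scores(generated_abs, actual_abs, avg=True)
print("Averaged by the package:", avg_scores)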
def compute_metrics_from_files(p_path_to_reference_file, p_path_to_candidate_file):
    """Compute BLEU-N and ROUGE-L metrics.

    IMPORTANT: No-answer reference will be excluded from calculation.

    Args:
        p_path_to_reference_file (str): path to reference file.
        p_path_to_candidate_file (str): path to candidate file.
        Both files should be in format:
            {QUERY_ID_JSON_ID: <a_query_id_int>,
             ANSWERS_JSON_ID: [<list_of_answers_string>]}

    Returns:
        dict: dictionary of {'bleu_n': <bleu_n score>, 'rouge_l': <rouge_l score>}
    """
    reference_dictionary, reference_no_answer_query_ids, reference_yes_answer_query_ids = \
        load_file(p_path_to_reference_file)
    candidate_dictionary, candidate_no_answer_query_ids, candidate_yes_answer_query_ids = \
        load_file(p_path_to_candidate_file)

    # Calculate accuracy of dealing with "no answer present"
    true_positives = len(candidate_yes_answer_query_ids.intersection(reference_yes_answer_query_ids))
    false_negatives = len(reference_yes_answer_query_ids) - true_positives
    true_negatives = len(candidate_no_answer_query_ids.intersection(reference_no_answer_query_ids))
    false_positives = len(reference_no_answer_query_ids) - true_negatives
    precision = float(true_positives) / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 1.
    recall = float(true_positives) / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 1.
    F1 = 2 * ((precision * recall) / (precision + recall))

    for query_id, answers in candidate_dictionary.items():
        assert len(answers) <= 1, \
            'query_id %d contains more than 1 answer "%s" in candidate file' % (query_id, str(answers))

    reference_query_ids = set(reference_dictionary.keys())
    candidate_query_ids = set(candidate_dictionary.keys())
    common_query_ids = reference_query_ids.intersection(candidate_query_ids)
    assert (len(common_query_ids) == len(reference_query_ids)) and \
        (len(common_query_ids) == len(candidate_query_ids)), \
        'Reference and candidate files must share same query ids'

    semantic_similarity = 0
    bleu = [0, 0, 0, 0]
    rouge_score = 0
    rouge = Rouge()
    smoothie = SmoothingFunction().method0

    for key in reference_dictionary:
        candidate_answer = remove_punctuation(candidate_dictionary[key][0])
        # nlp_candidate_answer = nlp(candidate_answer)
        reference_answers = reference_dictionary[key]
        candidate_values = [0, 0, 0, 0, 0, 0]
        selected_values = [0, 0, 0, 0, 0, 0]
        for reference_answer in reference_answers:
            if candidate_answer != ' ':
                reference_answer = remove_punctuation(reference_answer)
                if reference_answer == "no answer present":
                    # if no answer is possible, assign 1 if no answer was provided
                    # and 0 if an answer was provided
                    if candidate_answer == reference_answer:
                        for i in range(0, 6):
                            selected_values[i] += 1
                else:
                    reference_split = reference_answer.split(',')
                    # candidate_values[0] = nlp_candidate_answer.similarity(nlp(reference_answer))
                    candidate_values[0] = 0
                    candidate_values[1] = rouge.get_scores(candidate_answer, reference_answer)[0]['rouge-l']['f']
                    # sentence_bleu expects a list of tokenized references and a tokenized
                    # hypothesis; passing raw strings would score character n-grams
                    candidate_values[2] = sentence_bleu([reference_answer.split()], candidate_answer.split(), weights=(1, 0, 0, 0), smoothing_function=smoothie)
                    candidate_values[3] = sentence_bleu([reference_answer.split()], candidate_answer.split(), weights=(0.5, 0.5, 0, 0), smoothing_function=smoothie)
                    candidate_values[4] = sentence_bleu([reference_answer.split()], candidate_answer.split(), weights=(1 / 3.0, 1 / 3.0, 1 / 3.0, 0), smoothing_function=smoothie)
                    candidate_values[5] = sentence_bleu([reference_answer.split()], candidate_answer.split(), weights=(0.25, 0.25, 0.25, 0.25), smoothing_function=smoothie)
                    # partial credit for yes/no when complete answer is a yes/no question
                    if (candidate_answer == 'yes' and reference_answer[0:3] == candidate_answer) or \
                            (candidate_answer == 'no' and reference_answer[0:2] == candidate_answer):
                        for i in range(0, 6):
                            selected_values[i] += max(candidate_values[i], YES_NO_DISCOUNT_RATE)
                    else:
                        for i in range(0, 6):
                            selected_values[i] += candidate_values[i]
        semantic_similarity += (selected_values[0] / len(reference_answers))
        rouge_score += (selected_values[1] / len(reference_answers))
        for i in range(0, 4):
            bleu[i] += (selected_values[i + 2] / len(reference_answers))

    all_scores = {}
    all_scores['F1'] = F1
    all_scores['Precision'] = precision
    all_scores['Recall'] = recall
    all_scores['Accuracy'] = (true_positives + true_negatives) / (true_positives + true_negatives + false_positives + false_negatives)
    # all_scores['Semantic_Similarity'] = (semantic_similarity / len(reference_dictionary))
    all_scores['rouge_l'] = (rouge_score / len(reference_dictionary))
    for i in range(0, 4):
        all_scores['bleu_%d' % (i + 1)] = (bleu[i] / len(reference_dictionary))
    return all_scores
def evaluate_generation(self, IntraGRU, InterGRU, DecoderModel, Epoch,
                        concat_rating=False, write_origin=False,
                        write_insert_sql=False, _use_coverage=False,
                        _write_mode='evaluate', visulize_attn_epoch=0):
    EngStopWords = set(stopwords.words('english'))
    group_loss = 0
    decoder_epoch_loss = 0
    AttnVisualize = Visualization(self.save_dir, visulize_attn_epoch, self.num_of_reviews)
    rouge = Rouge()

    average_rouge_score = {
        'rouge-1': {'f': 0.0, 'p': 0.0, 'r': 0.0},
        'rouge-2': {'f': 0.0, 'p': 0.0, 'r': 0.0},
        'rouge-l': {'f': 0.0, 'p': 0.0, 'r': 0.0}
    }
    average_bleu_score = {
        'bleuScore-1': 0.0,
        'bleuScore-2': 0.0,
        'bleuScore-3': 0.0,
        'bleuScore-4': 0.0
    }

    def _get_onehot_rating(r):
        _encode_rating = self._rating_to_onehot(r)
        _encode_rating = torch.tensor(_encode_rating).to(self.device)
        return _encode_rating.unsqueeze(0)

    for batch_ctr in tqdm.tqdm(range(len(self.testing_batches[0]))):  # how many batches
        for idx in range(len(self.testing_batch_labels)):
            for reviews_ctr in range(len(self.testing_batches)):  # iter. through reviews
                word_batchs, lengths, ratings = self.testing_batches[reviews_ctr][batch_ctr]
                word_batchs = word_batchs.to(self.device)
                lengths = lengths.to(self.device)
                current_asins = torch.tensor(self.testing_asins[idx][batch_ctr]).to(self.device)
                current_reviewerIDs = torch.tensor(self.testing_reviewerIDs[idx][batch_ctr]).to(self.device)

                with torch.no_grad():
                    s_j, intra_hidden, intra_attn = IntraGRU[reviews_ctr](
                        word_batchs, lengths, current_asins, current_reviewerIDs)
                    s_j = s_j.unsqueeze(0)

                # Reviewer inf. for print.
                _reviewer = self.testing_external_memorys[reviews_ctr][batch_ctr]
                _reviewer = torch.tensor([val for val in _reviewer]).to(self.device)
                _reviewer = _reviewer.unsqueeze(0)
                _reviewer_cat = torch.cat((_reviewer_cat, _reviewer), 0) if reviews_ctr > 0 else _reviewer

                # concatenate reviews' rating
                _encode_rating = _get_onehot_rating(
                    self.testing_review_rating[reviews_ctr][batch_ctr]) if concat_rating else None  # encode rating

                # concatenate intra-reviews' review representation.
                if (reviews_ctr == 0):
                    s_seqence = s_j
                    r_seqence = None  # initialize input rating
                    r_seqence = _encode_rating if concat_rating else None
                else:
                    s_seqence = torch.cat((s_seqence, s_j), 0)
                    r_seqence = torch.cat((r_seqence, _encode_rating), 0) if concat_rating else None

                # Writing Intra-attention weight to .html file
                if (_write_mode == 'attention'):
                    for index_, candidateObj_ in enumerate(current_asins):
                        intra_attn_wts = intra_attn[:, index_].squeeze(1).tolist()
                        word_indexes = word_batchs[:, index_].tolist()
                        sentence, weights = AttnVisualize.wdIndex2sentences(
                            word_indexes, self.voc.index2word, intra_attn_wts)
                        new_weights = [float(wts / sum(weights[0])) for wts in weights[0]]
                        for w_index, word in enumerate(sentence[0].split()):
                            if (word in EngStopWords):
                                new_weights[w_index] = new_weights[w_index] * 0.001
                                if (new_weights[w_index] < 0.0001):
                                    new_weights[w_index] = 0
                        AttnVisualize.createHTML(
                            sentence, [new_weights], reviews_ctr,
                            fname='{}@{}'.format(self.itemObj.index2asin[candidateObj_.item()], reviews_ctr))

            with torch.no_grad():
                q_i, q_h, inter_attn_score, context_vector = InterGRU(
                    s_seqence, None, current_asins, current_reviewerIDs, review_rating=r_seqence)
                r_bar = q_i.squeeze(1)
                r_bar = (r_bar * (5 - 1) + 1)

            # Calculate square loss of HANN
            r_u_i = torch.tensor(self.testing_batch_labels[idx][batch_ctr]).to(self.device)
            hann_loss = self._mean_square_error(r_bar, r_u_i)
            group_loss += hann_loss

            """ Greedy Search Strategy Decoder """
            # Create initial decoder input (start with SOS tokens for each sentence)
            decoder_input = torch.LongTensor([[self.SOS_token for _ in range(self.batch_size)]])
            decoder_input = decoder_input.to(self.device)

            # # all one test
            # _all_one_point = [float(1.0) for _it in range(80)]
            # current_labels = torch.FloatTensor(_all_one_point).to(self.device)

            # Construct rating feature
            _encode_rating = _get_onehot_rating(r_u_i)

            # Set initial decoder hidden state to the inter_hidden's final hidden state
            decoder_hidden = q_h

            criterion = nn.NLLLoss()
            decoder_loss = 0

            # Ground truth sentences
            target_batch = self.testing_label_sentences[0][batch_ctr]
            target_variable, target_len, _ = target_batch
            target_variable = target_variable.to(self.device)

            # Generate max length
            max_target_len = self.setence_max_len

            # Initialize tensors to append decoded words to
            all_tokens = torch.zeros([0], device=self.device, dtype=torch.long)
            all_scores = torch.zeros([0], device=self.device)

            # Greedy search
            for t in range(max_target_len):
                if (t == 0 and _use_coverage):
                    # Set up initial coverage probability
                    initial_coverage_prob = torch.zeros(1, self.batch_size, self.voc.num_words)
                    initial_coverage_prob = initial_coverage_prob.to(self.device)
                    DecoderModel.set_coverage_prob(initial_coverage_prob, _use_coverage)

                decoder_output, decoder_hidden, decoder_attn_weight = DecoderModel(
                    decoder_input, decoder_hidden, context_vector,
                    _encode_rating=_encode_rating,
                    _user_emb=current_reviewerIDs,
                    _item_emb=current_asins)

                # No teacher forcing: next input is decoder's own current output
                decoder_scores_, topi = decoder_output.topk(1)
                decoder_input = torch.LongTensor([[topi[i][0] for i in range(self.batch_size)]])
                decoder_input = decoder_input.to(self.device)

                ds, di = torch.max(decoder_output, dim=1)

                # Record token and score
                all_tokens = torch.cat((all_tokens, decoder_input), dim=0)
                all_scores = torch.cat((all_scores, torch.t(decoder_scores_)), dim=0)

                # Coverage mechanism
                if (_use_coverage):
                    _softmax_output = DecoderModel.get_softmax_output()
                    _current_prob = _softmax_output.unsqueeze(0)
                    if (t == 0):
                        _previous_prob_sum = _current_prob
                    else:
                        # sum up previous probability
                        _previous_prob_sum = _previous_prob_sum + _current_prob
                    DecoderModel.set_coverage_prob(_previous_prob_sum, _use_coverage)

                # Calculate and accumulate loss
                nll_loss = criterion(decoder_output, target_variable[t])
                decoder_loss += nll_loss

            # decoder loss of this epoch
            decoder_epoch_loss += decoder_loss.item() / float(max_target_len)

            """ Decode user review from search result. """
            _bleu_score = {
                'bleuScore-1': 0.0,
                'bleuScore-2': 0.0,
                'bleuScore-3': 0.0,
                'bleuScore-4': 0.0
            }
            _rouge_score = {
                'rouge-1': {'f': 0.0, 'p': 0.0, 'r': 0.0},
                'rouge-2': {'f': 0.0, 'p': 0.0, 'r': 0.0},
                'rouge-l': {'f': 0.0, 'p': 0.0, 'r': 0.0}
            }

            for index_, user_ in enumerate(current_reviewerIDs):
                asin_ = current_asins[index_]
                current_user_tokens = all_tokens[:, index_].tolist()
                decoded_words = [self.voc.index2word[token] for token in current_user_tokens if token != 0]

                try:
                    product_title = self.asin2title[self.itemObj.index2asin[asin_.item()]]
                except Exception as ex:
                    product_title = 'None'

                # Show user attention
                inter_attn_score_ = inter_attn_score.squeeze(2).t()
                this_user_attn = inter_attn_score_[index_]
                this_user_attn = [str(val.item()) for val in this_user_attn]
                attn_text = ' ,'.join(this_user_attn)

                this_asin_input_reviewer = _reviewer_cat.t()[index_]
                input_reviewer = [self.userObj.index2reviewerID[val.item()] for val in this_asin_input_reviewer]

                # Show original sentences
                current_user_sen = target_variable[:, index_].tolist()
                origin_sen = [self.voc.index2word[token] for token in current_user_sen if token != 0]

                generate_text = str.format(f"""
=========================
Userid & asin:{self.userObj.index2reviewerID[user_.item()]},{self.itemObj.index2asin[asin_.item()]}
title:{product_title}
pre. consumer:{' ,'.join(input_reviewer)}
Inter attn:{attn_text}
Predict:{r_bar[index_].item()}
Rating:{r_u_i[index_].item()}
Generate: {' '.join(decoded_words)}
Origin: {' '.join(origin_sen)}
""")

                hypothesis = ' '.join(decoded_words)
                reference = ' '.join(origin_sen)  # there may be several references

                # BLEU Score Calculation
                bleu_score_1_ = nltk.translate.bleu_score.sentence_bleu([reference], hypothesis, weights=(1, 0, 0, 0))
                bleu_score_2_ = nltk.translate.bleu_score.sentence_bleu([reference], hypothesis, weights=(0, 1, 0, 0))
                bleu_score_3_ = nltk.translate.bleu_score.sentence_bleu([reference], hypothesis, weights=(0, 0, 1, 0))
                bleu_score_4_ = nltk.translate.bleu_score.sentence_bleu([reference], hypothesis, weights=(0, 0, 0, 1))
                sentence_bleu_score = [bleu_score_1_, bleu_score_2_, bleu_score_3_, bleu_score_4_]

                for num, val in enumerate(sentence_bleu_score):
                    generate_text = (generate_text + str.format('BLEU-{}: {}\n'.format((num + 1), val)))

                # Calculate bleu score of n-gram
                for _index, _gn in enumerate(_bleu_score):
                    _bleu_score[_gn] += sentence_bleu_score[_index]

                if Epoch > 3:
                    # ROUGE Score Calculation
                    try:
                        _rouge_score_current = rouge.get_scores(hypothesis, reference)[0]
                        for _rouge_method, _metrics in _rouge_score_current.items():
                            for _key, _val in _metrics.items():
                                _rouge_score[_rouge_method][_key] += _val
                    except Exception as msg:
                        pass

                # Write down sentences
                if _write_mode == 'generate':
                    if self.test_on_train_data:
                        fpath = (R'{}/GenerateSentences/on_train/'.format(self.save_dir))
                    else:
                        fpath = (R'{}/GenerateSentences/on_test/'.format(self.save_dir))

                    with open(fpath + 'sentences_ep{}.txt'.format(self.training_epoch), 'a') as file:
                        file.write(generate_text)

                    if (write_insert_sql):
                        # Write insert sql
                        sqlpath = (fpath + 'insert.sql')
                        self._write_generate_reviews_into_sqlfile(
                            sqlpath,
                            self.userObj.index2reviewerID[user_.item()],
                            self.itemObj.index2asin[asin_.item()],
                            ' '.join(decoded_words))

            # Average bleu score through reviewer
            for _index, _gn in enumerate(average_bleu_score):
                average_bleu_score[_gn] += (_bleu_score[_gn] / len(current_reviewerIDs))

            if Epoch > 3:
                # Average rouge score through reviewer
                for _rouge_method, _metrics in _rouge_score.items():
                    for _key, _val in _metrics.items():
                        average_rouge_score[_rouge_method][_key] += (_val / len(current_reviewerIDs))

    num_of_iter = len(self.testing_batches[0]) * len(self.testing_batch_labels)
    RMSE = group_loss / num_of_iter
    _nllloss = decoder_epoch_loss / num_of_iter
    batch_bleu_score = [average_bleu_score[_gn] / num_of_iter for _gn in average_bleu_score]

    if Epoch > 3:
        for _rouge_method, _metrics in average_rouge_score.items():
            for _key, _val in _metrics.items():
                average_rouge_score[_rouge_method][_key] = _val / num_of_iter

    return RMSE, _nllloss, batch_bleu_score, average_rouge_score
a few glaring looks towards his team before winning the second set . Murray had to put such matters aside as he tackled the unusually talented Thiem, a delight to watch. Coached by Boris Becker's veteran mentor Gunter Bresnik, he slightly resembles Andy Roddick and hits with similar power but more elegance. His single-handed backhand is a thing of rare beauty. However, he has had a mediocre season coming into this event and there was little to forewarn of his glorious shotmaking that seemed to catch Murray unawares early on. The world No 4 looked to have worked him out in the second, but then suffered one of his periodic mental lapses and let him back in from 4-1 before closing it out with a break. After breaking him for 3-1 in the decider the Austrian whirlwind burnt itself out. 'He's a strong guy who hits the ball hard and it became a very physical match,' said Murray. Murray was presented with a celebratory cake after winning his 500th match in the previous round """.replace('\n', '')

rouge = Rouge()

tokenizer = BartTokenizer.from_pretrained('bart-large-cnn')
model = BartForConditionalGeneration.from_pretrained('bart-large-cnn')
model.to(torch_device)

article_input_ids = tokenizer.batch_encode_plus([data_article], return_tensors='pt', max_length=1024)['input_ids'].to(torch_device)
summary_ids = model.generate(article_input_ids,
                             num_beams=4,
                             length_penalty=2.0,
                             max_length=142,
                             min_length=56,
                             no_repeat_ngram_size=3)

summary_txt = tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True)
pprint('Summary: ' + summary_txt)

scores = rouge.get_scores(summary_txt, data_article)
# display(Markdown('> **Summary: **' + summary_txt))
print('Rouge scores: ' + str(scores))  # scores is a list of dicts, so convert before concatenating
def cal_ROUGE(refer, candidate):
    # refer and candidate are token lists; fall back to a one-token list when
    # empty, otherwise ' '.join('unk') would produce 'u n k'
    if not candidate:
        candidate = ['unk']
    rouge = Rouge()
    scores = rouge.get_scores(' '.join(candidate), ' '.join(refer))
    return scores[0]['rouge-2']['f']
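# Hedged usage sketch (illustrative tokens only): both arguments are token
# lists, and the returned value is the ROUGE-2 F-measure.
example_f = cal_ROUGE(['the', 'cat', 'sat', 'down'], ['the', 'cat', 'sat'])
print(example_f)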
seq2seq_inf = Seq2Seq_Inference(encoder_preprocessor=body_pp,
                                decoder_preprocessor=title_pp)

# this method displays the predictions on random rows of the holdout set
seq2seq_inf.demo_model_predictions(n=5, issue_df=testdf)

from rouge import Rouge
rouge = Rouge()

test_title_text = testdf.issue_title.tolist()
test_body_text = testdf.body.tolist()
predict_title_text = [None] * len(test_body_text)
print(predict_title_text)

rouge_1_p, rouge_1_r, rouge_1_f, rouge_2_p, rouge_2_r, rouge_2_f, rouge_l_p, rouge_l_f, rouge_l_r = 0, 0, 0, 0, 0, 0, 0, 0, 0
for i in range(len(test_body_text)):
    exm, predict_title_text[i] = seq2seq_inf.generate_issue_title(raw_input_text=test_body_text[i])
    scores = rouge.get_scores(predict_title_text[i], test_title_text[i])
    rouge_1_f = rouge_1_f + scores[0]['rouge-1']['f']
    rouge_2_f = rouge_2_f + scores[0]['rouge-2']['f']
    rouge_l_f = rouge_l_f + scores[0]['rouge-l']['f']

print("ROUGE-1:", rouge_1_f / len(test_body_text))
print("ROUGE-2:", rouge_2_f / len(test_body_text))
print("ROUGE-l:", rouge_l_f / len(test_body_text))
print("Average of ROUGE-1, ROUGE-2 and ROUGE-l: ", (rouge_1_f + rouge_2_f + rouge_l_f) / 3 / len(test_body_text))
# ,sentence_bleu(reference, candidate, weights=(0.33, 0.33, 0.33, 0))
# ,sentence_bleu(reference, candidate, weights=(0.25, 0.25, 0.25, 0.25))]

row = []
for i in range(len(ref_sents)):
    r = ref_sents[i]
    c = pred_sents[i]
    reference = nlp(str(r))
    reference = [[str(x) for x in list(reference)]]
    candidate = nlp(str(c))
    candidate = [str(x) for x in list(candidate)]
    row.append([
        sentence_bleu(reference, candidate, weights=(1, 0, 0, 0)),
        sentence_bleu(reference, candidate, weights=(0.5, 0.5, 0, 0)),
        sentence_bleu(reference, candidate, weights=(0.33, 0.33, 0.33, 0)),
        sentence_bleu(reference, candidate, weights=(0.25, 0.25, 0.25, 0.25)),
        rouge.get_scores(hyps=c, refs=r)[0]['rouge-l']['f']
    ])

# candidate = nlp(pred_bm7)
# candidate = [str(x) for x in list(candidate)]
# df_result['bm7'] = [sentence_bleu(reference, candidate, weights=(1, 0, 0, 0))
#                     ,sentence_bleu(reference, candidate, weights=(0.5, 0.5, 0, 0))
#                     ,sentence_bleu(reference, candidate, weights=(0.33, 0.33, 0.33, 0))
#                     ,sentence_bleu(reference, candidate, weights=(0.25, 0.25, 0.25, 0.25))]

df_result = pd.DataFrame(row)
df_result.columns = ['BLEU-1', 'BLEU-2', 'BLEU-3', 'BLEU-4', 'ROUGE-L']
df_result.round(3)

# In[ ]:

ref_sents, pred_sents
def evaluate(model_path, test_path, config_path, metric, is_multiple_ref, max_count, report_every, batch_size):
    params_path = config_path or os.path.join(model_path, "config.json")

    params = Params.from_file(params_path)
    is_subwords = "tokenizer" in params["reader"] and params["reader"]["tokenizer"]["type"] == "subword"
    reader = DatasetReader.from_params(params.pop("reader"))

    device = 0 if torch.cuda.is_available() else -1
    model = Model.load(params, model_path, cuda_device=device)
    model.training = False
    print(model)
    print("Trainable params count: ", sum(p.numel() for p in model.parameters() if p.requires_grad))

    hyps = []
    refs = []
    predictor = Seq2SeqPredictor(model, reader)
    for batch in get_batches(reader, test_path, batch_size):
        outputs = predictor.predict_batch_json(batch)
        targets = [b.get('target') for b in batch]
        for output, target in zip(outputs, targets):
            decoded_words = output["predicted_tokens"]
            if not is_multiple_ref:
                hyp = detokenize(" ".join(decoded_words)) if not is_subwords else "".join(decoded_words).replace("▁", " ")
                if len(hyp.strip()) <= 1:
                    hyp = "empty"
                    print("Empty hyp")
                if len(target.strip()) <= 1:
                    target = "empty"
                    print("Empty target")
                ref = [target]
            else:
                if isinstance(target, list):
                    reference_sents = target
                elif isinstance(target, str):
                    reference_sents = target.split(" s_s ")
                else:
                    assert False
                decoded_sents = (" ".join(decoded_words)).split("s_s")
                # unescape the HTML entities left over from preprocessing
                hyp = [w.replace("&lt;", "<").replace("&gt;", ">").strip() for w in decoded_sents]
                ref = [w.replace("&lt;", "<").replace("&gt;", ">").strip() for w in reference_sents]
                hyp = " ".join(hyp)
                ref = [" ".join(ref)]

            hyps.append(hyp)
            refs.append(ref)

            if len(hyps) % report_every == 0:
                print("Count: ", len(hyps))
                print("Ref: ", ref)
                print("Hyp: ", hyp)
                if metric in ("bleu", "all"):
                    from nltk.translate.bleu_score import corpus_bleu
                    print("BLEU: ", corpus_bleu(refs, hyps))
                if metric in ("rouge", "all"):
                    rouge = Rouge()
                    scores = rouge.get_scores(hyps, [r[0] for r in refs], avg=True)
                    print("ROUGE: ", scores)

            if max_count and len(hyps) >= max_count:
                break
def decoder(args, model_config, model, vocab):
    model_dir_list = get_evaluate_top_k(args.output_dir)
    decoder_info_str = "\n".join(model_dir_list)
    decoder_info_file = os.path.join(args.output_dir, "decoder.txt")
    with open(decoder_info_file, "w", encoding='utf-8') as f:
        f.write(decoder_info_str)
    print("Decoding\n{}".format(decoder_info_str))

    test_feature_dir = os.path.join(args.feature_dir, "test")
    feature_file_list = os.listdir(test_feature_dir)
    rouge = Rouge()

    model_iterator = trange(int(len(model_dir_list)), desc="Model.bin File")
    with torch.no_grad():
        for model_idx in model_iterator:
            model_dir = model_dir_list[model_idx]
            decoder_dir = model_dir
            predict_file = os.path.join(decoder_dir, "predict.txt")
            score_json = {}
            score_json_file = os.path.join(decoder_dir, "score.json")
            result_json = {}
            result_json_file = os.path.join(decoder_dir, "result.json")

            model_path_name = os.path.join(model_dir, "model.bin")
            model.load_state_dict(torch.load(model_path_name))
            model = model.to(args.device)
            model.eval()

            file_iterator = trange(int(len(feature_file_list)), desc=decoder_dir)
            for file_idx in file_iterator:
                file = feature_file_list[file_idx]
                path_file = os.path.join(test_feature_dir, file)
                test_dataset, news_ids, oovs, titles = get_features_from_cache(path_file)
                test_sampler = SequentialSampler(test_dataset)
                train_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=1)
                data_iterator = tqdm(train_dataloader, desc=decoder_dir)
                for i, batch in enumerate(data_iterator):
                    batch = from_feature_get_model_input(batch,
                                                         hidden_dim=model_config.hidden_dim,
                                                         device=args.device,
                                                         pointer_gen=model_config.pointer_gen,
                                                         use_coverage=model_config.use_coverage)
                    news_id = news_ids[i]
                    current_oovs = oovs[i]
                    current_title = titles[i][1:-1]  # strip the start/stop tokens

                    beam = model(encoder_input=batch[0],
                                 encoder_mask=batch[1],
                                 encoder_with_oov=batch[2],
                                 oovs_zero=batch[3],
                                 context_vec=batch[4],
                                 coverage=batch[5],
                                 mode="decode",
                                 beam_size=10)

                    # drop the start token
                    hypothesis_idx_list = beam.tokens[1:]
                    if vocab.stop_idx == hypothesis_idx_list[-1]:
                        hypothesis_idx_list = hypothesis_idx_list[:-1]

                    hypothesis_token_list = [idx_to_token(index, oov_word=current_oovs, vocab=vocab)
                                             for index in hypothesis_idx_list]
                    hypothesis_str = " ".join(hypothesis_token_list)
                    reference_str = " ".join(current_title)

                    result_str = "{}\t{}\t{}\n".format(news_id, reference_str, hypothesis_str)
                    with open(file=predict_file, mode='a', encoding='utf-8') as f:
                        f.write(result_str)

                    rouge_score = rouge.get_scores(hyps=hypothesis_str, refs=reference_str)
                    score_json[news_id] = rouge_score[0]

            with open(score_json_file, 'w') as f:
                json.dump(score_json, f)

            rouge_1_f = []
            rouge_1_p = []
            rouge_1_r = []
            rouge_2_f = []
            rouge_2_p = []
            rouge_2_r = []
            rouge_l_f = []
            rouge_l_p = []
            rouge_l_r = []
            for name, score in score_json.items():
                rouge_1_f.append(score["rouge-1"]['f'])
                rouge_1_p.append(score["rouge-1"]['p'])
                rouge_1_r.append(score["rouge-1"]['r'])
                rouge_2_f.append(score["rouge-2"]['f'])
                rouge_2_p.append(score["rouge-2"]['p'])
                rouge_2_r.append(score["rouge-2"]['r'])
                rouge_l_f.append(score["rouge-l"]['f'])
                rouge_l_p.append(score["rouge-l"]['p'])
                rouge_l_r.append(score["rouge-l"]['r'])

            mean_1_f = sum(rouge_1_f) / len(rouge_1_f)
            mean_1_p = sum(rouge_1_p) / len(rouge_1_p)
            mean_1_r = sum(rouge_1_r) / len(rouge_1_r)
            mean_2_f = sum(rouge_2_f) / len(rouge_2_f)
            mean_2_p = sum(rouge_2_p) / len(rouge_2_p)
            mean_2_r = sum(rouge_2_r) / len(rouge_2_r)
            mean_l_f = sum(rouge_l_f) / len(rouge_l_f)
            mean_l_p = sum(rouge_l_p) / len(rouge_l_p)
            mean_l_r = sum(rouge_l_r) / len(rouge_l_r)

            result_json['mean_1_f'] = mean_1_f
            result_json['mean_1_p'] = mean_1_p
            result_json['mean_1_r'] = mean_1_r
            result_json['mean_2_f'] = mean_2_f
            result_json['mean_2_p'] = mean_2_p
            result_json['mean_2_r'] = mean_2_r
            result_json['mean_l_f'] = mean_l_f
            result_json['mean_l_p'] = mean_l_p
            result_json['mean_l_r'] = mean_l_r

            with open(result_json_file, 'w') as f:  # result file is created if absent
                json.dump(result_json, f)  # serialize the aggregated means to JSON
    contradiction_scores.append(contradiction)
    contradiction_bert_scores.append(contradiction_bert)
    invalid_simplification_scores.append(invalid_simplification)
    if print_scores:
        print("score:", score, end="\t")

    if copy:
        average_copy_length = util.average_copy_length(src_line, gen_line)
        average_copy_lengths.append(average_copy_length)
        if print_scores:
            print("average copy length:", average_copy_length, end="\t")

    if rouge:
        rouge_score = r.get_scores(gen_line, tgt_line)
        rouge_scores += rouge_score
        # if print_scores:
        #     print("rouge:", rouge_score, end="\t")

    if print_scores:
        print()

    if cache_dir and (i + 1) % 500 == 0:
        if not no_test:
            np.save(cache_dir + "scores" + str(i + 1), contained_scores)
            np.save(cache_dir + "contained_bert_scores" + str(i + 1), contained_bert_scores)
            np.save(cache_dir + "missing_scores" + str(i + 1), missing_scores)
class LetsNet:
    def __init__(self, embedding_sz=5):
        self.encoder_model = SentenceTransformer('bert-base-nli-mean-tokens')
        self.rouge = Rouge()
        self.cluster_n = 5
        self.embedding_sz = embedding_sz
        self.kmeans = KMeans(n_clusters=self.cluster_n)
        self.stop_words = set(stopwords.words('english'))

    def encode(self, sentences):
        sentence_embeddings = self.encoder_model.encode(sentences)
        return sentence_embeddings

    def getCentroidRepresentative(self, clusters, sentence_embeddings):
        centroids = []
        for idx in range(self.cluster_n):
            centroid_id = np.where(clusters.labels_ == idx)[0]
            centroids.append(np.mean(centroid_id))
        closest, _ = pairwise_distances_argmin_min(clusters.cluster_centers_, sentence_embeddings)
        ordering = sorted(range(self.cluster_n), key=lambda k: centroids[k])
        return closest, ordering

    def evaluate(self, model_sum, gt_sum):
        """
        Gives rouge score
        :param model_sum: list of summaries returned by the model
        :param gt_sum: list of ground truth summary from catchphrases
        :return: ROUGE score
        """
        return self.rouge.get_scores(model_sum, gt_sum, avg=True)

    def getSentenceSummary(self, sentences: list):
        """
        Returns summary of sentence
        :param sentences: list of sentences
        :return: summary text
        """
        sentence_enc = self.encode(sentences)
        clusters = self.kmeans.fit(sentence_enc)
        closest, ordering = self.getCentroidRepresentative(clusters, sentence_enc)
        summary = '.'.join([sentences[closest[idx]] for idx in ordering]).replace('\n', ' ')
        return summary

    def main(self):
        """
        Executes the entire pipeline of the code
        :return: void
        """
        gt = getGroundTruth()
        model_sum, gt_sum = [], []
        doc_n = len(gt)
        for doc_idx in range(20):
            print("{}/{}".format(doc_idx, doc_n))
            full_text, catch_phrases = gt[doc_idx]
            summary = self.getSentenceSummary(full_text)
            model_sum.append(summary)
            gt_sum.append(".".join(catch_phrases))
        print("ROUGE score: {}".format(self.evaluate(model_sum, gt_sum)))

    def getIntroductions(self):
        """
        Returns the first catch phrase of every doc
        :return: void
        """
        gt = getGroundTruth()
        intro_word_freq = {}
        for full_text, catch_phrases in gt[:500]:
            intro_words = catch_phrases[0].split(" ")
            for word in intro_words:
                if word not in self.stop_words:
                    if word not in intro_word_freq:
                        intro_word_freq[word] = 0
                    intro_word_freq[word] += 1
        intro_words = [(word, freq) for word, freq in intro_word_freq.items()]
        intro_words.sort(key=lambda x: x[1], reverse=True)
        print(intro_words)

    def getConclusion(self):
        """
        Returns the last catch phrase of every doc
        :return: void
        """
        gt = getGroundTruth()
        conclusion_freq = {}
        for full_text, catch_phrases in gt[:500]:
            conclusion = catch_phrases[-1]
            if conclusion not in conclusion_freq:
                conclusion_freq[conclusion] = 0
            conclusion_freq[conclusion] += 1
        conclusions = [(word, freq) for word, freq in conclusion_freq.items()]
        conclusions.sort(key=lambda x: x[1], reverse=True)
        for conclusion, _ in conclusions:
            print(conclusion)

    def getHeadings(self):
        """
        Returns the headings of whole text
        :return: void
        """
        gt = getGroundTruth()
        pattern = re.compile(r'.+(\n )+\n.+')
        for full_text, catch_phrases in gt[:1]:
            print("".join(full_text))
            headings = []
            for sent in full_text:
                if pattern.search(sent) is not None:
                    sent = re.sub(r'(\n( )*)+\n', r'\n', sent)
                    headings.append(sent)
            print(len(headings))
            for heading in headings:
                print("============================")
                print(heading)
O = []
P = []
for i in range(len(xTest)):
    try:
        o = seq2summary(yTest[i])
        p = decode_sequence(xTest[i].reshape(1, textLength))
        O.append(str(o))
        P.append(str(p))
    except Exception as e:
        pass

rouge = Rouge()
scores = rouge.get_scores(P, O, avg=True)

# file.write() takes a single string, so the scores dict and each example
# must be formatted before writing
f = open("GloVeOutputScores.txt", "w")
f.write(str(scores))
f.close()

f = open("GloVeOutputExamples.txt", "w")
for i in range(30):
    f.write("Review: " + seq2text(xTest[i]) + "\n")
    f.write("Original summary: " + seq2summary(yTest[i]) + "\n")
    f.write("Predicted summary: " + decode_sequence(xTest[i].reshape(1, textLength)) + "\n")
    f.write("----------\n")
f.close()
class LetsNet:
    def __init__(self, embedding_sz=5):
        self.encoder_model = SentenceTransformer('bert-base-nli-mean-tokens')
        self.rouge = Rouge()
        self.cluster_n = 5
        self.embedding_sz = embedding_sz
        self.kmeans = KMeans(n_clusters=self.cluster_n)

    def encode(self, sentences):
        sentence_embeddings = self.encoder_model.encode(sentences)
        features_n = len(sentence_embeddings[0])
        sentences_n = len(sentences)
        # min-max normalize each embedding dimension across the batch of sentences
        norm_embedding = [[embed_i[idx] for idx in range(features_n)] for embed_i in sentence_embeddings]
        for idx in range(features_n):
            features = [embed_i[idx] for embed_i in sentence_embeddings]
            min_feature_val = min(features)
            max_feature_val = max(features)
            range_feature_val = max_feature_val - min_feature_val
            for sent_idx in range(sentences_n):
                norm_embedding[sent_idx][idx] = (norm_embedding[sent_idx][idx] - min_feature_val) / range_feature_val
        pca_embedding = [np.array([norm_vec[idx] for idx in range(features_n)]) for norm_vec in norm_embedding]
        # print(pca_embedding)
        # pca_embedding = np.copy(sentence_embeddings[0, 1, 2, 3, 4, 5])
        return pca_embedding

    def getCentroidRepresentative(self, clusters, sentence_embeddings):
        centroids = []
        for idx in range(self.cluster_n):
            centroid_id = np.where(clusters.labels_ == idx)[0]
            centroids.append(np.mean(centroid_id))
        closest, _ = pairwise_distances_argmin_min(clusters.cluster_centers_, sentence_embeddings)
        ordering = sorted(range(self.cluster_n), key=lambda k: centroids[k])
        return closest, ordering

    def evaluate(self, model_sum, gt_sum):
        """
        Gives rouge score
        :param model_sum: list of summaries returned by the model
        :param gt_sum: list of ground truth summary from catchphrases
        :return: ROUGE score
        """
        return self.rouge.get_scores(model_sum, gt_sum, avg=True)

    def getSentenceSummary(self, sentences: list):
        """
        Returns summary of sentence
        :param sentences: list of sentences
        :return: summary text
        """
        sentence_enc = self.encode(sentences)
        clusters = self.kmeans.fit(sentence_enc)
        closest, ordering = self.getCentroidRepresentative(clusters, sentence_enc)
        summary = '.'.join([sentences[closest[idx]] for idx in ordering]).replace('\n', ' ')
        return summary

    def main(self):
        """
        Executes the entire pipeline of the code
        :return: void
        """
        gt = getGroundTruth()
        model_sum, gt_sum = [], []
        doc_n = len(gt)
        for doc_idx in range(20):
            print("{}/{}".format(doc_idx, doc_n))
            full_text, catch_phrases = gt[doc_idx]
            summary = self.getSentenceSummary(full_text)
            model_sum.append(summary)
            gt_sum.append(".".join(catch_phrases))
        print("ROUGE score: {}".format(self.evaluate(model_sum, gt_sum)))
        decodetext += line
    f.close()

    f = open(os.path.join(referencedir, file[0:6] + '_reference.txt'), 'r', encoding='utf-8')
    reftext = ''
    for line in f.readlines():
        reftext += line
    f.close()

    ref_len += len(reftext.split())
    gen_len += len(decodetext.split())

    # sentence_bleu expects a list of tokenized references and a tokenized
    # hypothesis; raw strings would be scored at the character level
    reference = [reftext.split()]
    candidate = decodetext.split()
    rouge_score = rouge.get_scores(decodetext, reftext)
    rougescore1 += rouge_score[0]["rouge-1"]['r']
    rougescore2 += rouge_score[0]["rouge-2"]['r']
    rougescorel += rouge_score[0]["rouge-l"]['r']
    bleuscore1 += sentence_bleu(reference, candidate, weights=(1, 0, 0, 0))
    bleuscore2 += sentence_bleu(reference, candidate, weights=(0, 1, 0, 0))
    bleuscoren += sentence_bleu(reference, candidate, weights=(0.25, 0.25, 0.25, 0.25))

bleuscore1 /= len(dir_or_files)
bleuscore2 /= len(dir_or_files)
bleuscoren /= len(dir_or_files)
rougescore1 /= len(dir_or_files)
rougescore2 /= len(dir_or_files)
def getScore(m_prediction, m_original):
    rouge = Rouge()
    scores = rouge.get_scores(m_prediction, m_original)
    debug('[' + m_prediction + '][' + m_original + '] => score is [' + str(scores) + ']')
    return scores
def run_test(model, dataset, loader, model_name, hps):
    test_dir = os.path.join(hps.save_root, "test")  # make a subdir of the root dir for eval data
    eval_dir = os.path.join(hps.save_root, "eval")
    if not os.path.exists(test_dir):
        os.makedirs(test_dir)
    if not os.path.exists(eval_dir):
        logger.exception("[Error] eval_dir %s doesn't exist. Run in train mode to create it.", eval_dir)
        raise Exception("[Error] eval_dir %s doesn't exist. Run in train mode to create it." % (eval_dir))

    resfile = None
    if hps.save_label:
        log_dir = os.path.join(test_dir, hps.cache_dir.split("/")[-1])
        resfile = open(log_dir, "w")
        logger.info("[INFO] Write the Evaluation into %s", log_dir)

    model = load_test_model(model, model_name, eval_dir, hps.save_root)
    model.eval()

    iter_start_time = time.time()
    with torch.no_grad():
        logger.info("[Model] Sequence Labeling!")
        tester = SLTester(model, hps.m, limited=hps.limited, test_dir=test_dir)

        for i, (G, index) in enumerate(loader):
            if hps.cuda:
                G.to(torch.device("cuda"))
            tester.evaluation(G, index, dataset, blocking=hps.blocking)

    running_avg_loss = tester.running_avg_loss

    if hps.save_label:
        # save label and do not calculate rouge
        json.dump(tester.extractLabel, resfile)
        tester.SaveDecodeFile()
        logger.info(' | end of test | time: {:5.2f}s | '.format((time.time() - iter_start_time)))
        return

    logger.info("The number of pairs is %d", tester.rougePairNum)
    if not tester.rougePairNum:
        logger.error("During testing, no hyps is selected!")
        sys.exit(1)

    if hps.use_pyrouge:
        if isinstance(tester.refer[0], list):
            logger.info("Multi Reference summaries!")
            scores_all = utils.pyrouge_score_all_multi(tester.hyps, tester.refer)
        else:
            scores_all = utils.pyrouge_score_all(tester.hyps, tester.refer)
    else:
        rouge = Rouge()
        scores_all = rouge.get_scores(tester.hyps, tester.refer, avg=True)

    res = "Rouge1:\n\tp:%.6f, r:%.6f, f:%.6f\n" % (scores_all['rouge-1']['p'], scores_all['rouge-1']['r'], scores_all['rouge-1']['f']) \
        + "Rouge2:\n\tp:%.6f, r:%.6f, f:%.6f\n" % (scores_all['rouge-2']['p'], scores_all['rouge-2']['r'], scores_all['rouge-2']['f']) \
        + "Rougel:\n\tp:%.6f, r:%.6f, f:%.6f\n" % (scores_all['rouge-l']['p'], scores_all['rouge-l']['r'], scores_all['rouge-l']['f'])
    logger.info(res)

    tester.getMetric()
    tester.SaveDecodeFile()
    logger.info('[INFO] End of test | time: {:5.2f}s | test loss {:5.4f} | '.format(
        (time.time() - iter_start_time), float(running_avg_loss)))
# from rouge_score import rouge_scorer
# scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
# # scores = scorer.score("The quick brown fox jumps over the lazy dog", "the quick brown dog jumps on the log.")
# scores = scorer.score("the quick brown dog jumps on the log.", "the quick brown dog jumps on log, it is a quick thing, I like it so much")
# print("scores", scores)

from rouge import Rouge

hyp = "the quick brown dog jumps on the log."
ref = "the quick brown dog jumps on log, it is a quick thing, I like it so much"
rouge = Rouge()
scores = rouge.get_scores(hyp, ref)  # get_scores takes (hyps, refs), so the hypothesis goes first
print("scores", scores)
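# Follow-up sketch (not in the original snippet): the result is a list with
# one dict per hypothesis/reference pair, so a single metric can be pulled
# out directly.
print("rouge-l f:", scores[0]['rouge-l']['f'])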
        filtered_system_summary.append(sW.lower())

filtered_ref_summary = []
for rW in ref_summ_tokens:
    if rW not in util.stop_words and rW not in util.punc:
        filtered_ref_summary.append(rW.lower())

# Calculate ROUGE-1 scores: precision and recall based on word overlaps
# between the system summary and the reference summary
overlap_tokens = 0
tokens_sys_summ = len(filtered_system_summary)
tokens_ref_summ = len(filtered_ref_summary)
for w in filtered_system_summary:
    if (w in filtered_ref_summary):
        # print(w)  # DEBUG
        overlap_tokens += 1
        filtered_ref_summary.remove(w)

recall = overlap_tokens / tokens_ref_summ
precision = overlap_tokens / tokens_sys_summ
f_score = 2 / ((1 / precision) + (1 / recall))
print('\nROUGE-1 Scores\nF-SCORE: %f\nPRECISION: %f\nRECALL: %f\n' % (f_score, precision, recall))

# Scores generated by Rouge package - considers all words
print('Results from Rouge package')
rouge = Rouge()
scores = rouge.get_scores(sys_summ, ref_summ, avg=True)
print(scores)
r.model_filename_pattern = 'tmp.[A-Z].#ID#.txt'
output = r.convert_and_evaluate()
print(output)
output_dict = r.output_to_dict(output)

##################################################################
## Second approach: pure Python implementation
from rouge import Rouge
from pprint import pprint

##################################################################
## Score 1 sentence
hypothesis = "the #### transcript is a written version of each day 's cnn student news program use this transcript to help students with reading comprehension and vocabulary use the weekly newsquiz to test your knowledge of stories you saw on cnn student news"
reference = "this page includes the show transcript use the transcript to help students with reading comprehension and vocabulary at the bottom of the page , comment for a chance to be mentioned on cnn student news . you must be a teacher or a student age # # or older to request a mention on the cnn student news roll call . the weekly newsquiz tests students ' knowledge of events in the news"
rouge = Rouge()
scores = rouge.get_scores(hypothesis, reference)
pprint(scores)
# [{'rouge-1': {'f': 0.49411764217577864,
#               'p': 0.5833333333333334,
#               'r': 0.42857142857142855},
#   'rouge-2': {'f': 0.23423422957552154,
#               'p': 0.3170731707317073,
#               'r': 0.18571428571428572},
#   'rouge-l': {'f': 0.42751590030718895,
#               'p': 0.5277777777777778,
#               'r': 0.3877551020408163}}]
print(scores[0]['rouge-l']['f'])  # 0.42751590030718895

##################################################################
## Score multiple sentences
hyps = ['i am jiaruipeng', 'hello world', 'ni hao']
refs = ['jiaruipeng is good', 'world is wonderful', 'wo hao']
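##################################################################
## Sketch of scoring the lists above (not in the original snippet): passing
## parallel lists returns one dict per pair, and avg=True averages them.
scores = rouge.get_scores(hyps, refs)
pprint(scores)
avg_scores = rouge.get_scores(hyps, refs, avg=True)
pprint(avg_scores)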
def rouge_compute(hyps, refs):
    rouge = Rouge()
    scores = rouge.get_scores(hyps, refs, avg=True)
    return scores
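# Hedged usage sketch for rouge_compute (example strings are illustrative):
# hyps and refs are parallel lists of space-separated token strings, and the
# averaged dict is keyed by 'rouge-1' / 'rouge-2' / 'rouge-l'.
avg = rouge_compute(['the cat sat on the mat'], ['the cat sat down on the mat'])
print(avg['rouge-1']['f'])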
print("-") print("TRG:", a) print("-") try: print("GEN:", text) except: print("GEN: `Skip this example. Possible error occurred in decoding text since gpt2 generated irrigular coding.`") print("-"*50, flush=True) refs.append(a.lower()) hyps.append(text.lower()) save_pred.append([' '.join(q.split(' ')[-6:]), text, a]) hyps = [(x if x != "" else "<|endoftext|>") for x in hyps] # rouge of 200 samples rouge = Rouge() scores = rouge.get_scores(hyps, refs, avg=True) print("ROUGE-1 : ", scores['rouge-1']) print("ROUGE-2 : ", scores['rouge-2']) print("ROUGE-L : ", scores['rouge-l']) # bleu of 200 samples warnings.simplefilter("ignore") score = corpus_bleu([[ref.split(' ')] for ref in refs], [hyp.split(' ') for hyp in hyps]) print("BLEU : ", score) # save prediction to file with open(args.pred_file, 'w') as csvfile: writer = csv.writer(csvfile, delimiter=',') writer.writerows(save_pred) sys.stdout.flush()
class Loader:
    def __init__(self, name):
        self.name = name
        self.train_data = {}
        self.train_data['text'] = []
        self.train_data['label'] = []
        self.train_data["candi"] = []
        self.rouge = Rouge()

    def get_document(self, document):
        sentences = possess_sentence(document)
        return sentences

    def get_labels(self, label):
        sentences = possess_sentence(label)
        return sentences

    def get_score(self, sen1, sen2):
        # average of ROUGE-1/2/L recall between the two texts
        score = 0
        rouge_score = self.rouge.get_scores(sen1, sen2)
        score += rouge_score[0]["rouge-1"]['r']
        score += rouge_score[0]["rouge-2"]['r']
        score += rouge_score[0]["rouge-l"]['r']
        return score / 3

    def pad_and_add_token(self, poss_data, max_len):
        data_list = []
        for x in poss_data:
            if len(x) >= max_len - 2:
                x = x[0:max_len - 3]
                x.append(102)
            l = x
            x = [101]
            x.extend(l)
            while len(x) < max_len:
                x.append(0)
            data_list.append(x)
        return data_list

    def check_data(self, path1, pair_num):
        fo = open(path1, "r", encoding='gb18030', errors='ignore')
        print("Start to check")
        index = 0
        for i in range(pair_num * 10):
            line1 = fo.readline()
            line1 = line1.replace("\n", "")
            if len(line1) == 0:
                print("hit error at ", i)
            index += 1
        print(index, "sentence is completed")
        fo.close()

    def gen_data(self, path1, path2, pairs_num):
        fo = open(path1, "r", encoding='gb18030', errors='ignore')
        f = open(path2, 'w')
        number = 0
        print("----Start to generate candi data----")
        for i in range(pairs_num):
            line1 = fo.readline()
            line1 = line1.strip()
            if line1 is None:
                continue
            do = self.get_document(line1)
            sentence = {}
            document = " ".join(do)
            for o in do:
                if o is not None:
                    try:
                        sentence[o] = self.get_score(o, document)
                    except Exception as e:
                        pass
                    continue
            sort_sentences = sorted(sentence.items(), key=lambda x: x[1], reverse=True)
            candidata_sentence_set = sort_sentences[:5]
            sentences = []
            for i in candidata_sentence_set:
                sentences.append(i[0])
            while len(sentences) < 5:
                sentences.append(sentences[0])
            indices = list(combinations(sentences, 2))
            candidata = []
            for i in indices:
                candidata.append(" ".join(i))
            number += len(candidata)
            for j in candidata:
                f.write(j)
                f.write('\n')
        f.close()
        print("----gen finished with ", number, "----")

    def read_data(self, path1, path2, path3, pairs_num, max_len=128, init_flag=True):
        print("----start Read train data----")
        fo = open(path1, "r", encoding='gb18030', errors='ignore')
        fl = open(path2, "r", encoding='gb18030', errors='ignore')
        candi_list = []
        pbar = ProgressBar(n_total=pairs_num, desc='Loading')
        if init_flag:
            self.gen_data(path1, path3, pairs_num)
            self.check_data(path3, pairs_num)
        fc = open(path3, "r", encoding='gb18030', errors='ignore')
        origin_labels = []
        origin_candi = []
        for i in range(pairs_num):
            pbar(i, {'current': i})
            line1 = fo.readline()
            line2 = fl.readline()
            if line1 is None or line2 is None:
                continue
            # line1 = "A ##SENT## B ##SENT## C ##SENT## D ##SENT## E ##SENT## F"
            do = self.get_document(line1)
            la = self.get_labels(line2)
            document = " ".join(do)
            la = " ".join(la)
            origin_labels.append(la)
            candidata_data = []
            temp_candi = []
            for j in range(10):
                temp = fc.readline()
                temp = temp.replace("\n", "")
                temp_candi.append(temp)
                if len(temp) == 0:
                    print("Hit bad Trap at", i * 10 + j)
                candidata_data.append(tokenizer.encode(temp, add_special_tokens=False))
            # print(len(candidata_data))
            # print(candidata_data[0])
            origin_candi.append(temp_candi)
            self.train_data['text'].append(tokenizer.encode(document, add_special_tokens=False))
            self.train_data['label'].append(tokenizer.encode(la, add_special_tokens=False))
            self.train_data['candi'].append(candidata_data)

        data_list = self.pad_and_add_token(self.train_data['text'], max_len)
        label_list = self.pad_and_add_token(self.train_data['label'], max_len)
        pos = 0
        for i in self.train_data['candi']:
            pos += 1
            temp = self.pad_and_add_token(i, max_len)
            candi_list.append(temp)
        train_data = torch.tensor(data_list)
        train_label = torch.tensor(label_list)
        train_candi = torch.tensor(candi_list)
        return train_data, train_label, train_candi, origin_labels, origin_candi
listfinal = sorted(pr, key=pr.get)
# print(type(summarylines))
for i in range(numlines - 1, numlines - int(summarylines) - 1, -1):
    print(listfinal[i])  # printing summary of desired no of lines

listfinal_str = ''.join(map(str, listfinal))
# print(type(listfinal_str))
with open(sys.argv[3], 'r') as myfile:
    data = myfile.read().replace('\n', '')
# print(type(data))

# Rouge score calculation for backward variant
print("_________________Rouge Score for backward variant _________________________")
rouge = Rouge()
scores = rouge.get_scores(listfinal_str, data)
print(scores)

for k in range(0, numlines):
    g1 = []
    sentence1 = ""
    sentence1 = G2.nodes()[k]
    p1 = sentence1
    stop_words = set(stopwords.words('english'))  # forming set of the stopwords
    word_tokens = word_tokenize(p1)
    filtered_sentence = [w for w in word_tokens if not w in stop_words]
    filtered_sentence = []
_iter = 435864
dec_path = args.beam_dir + args.mode + '_iter_' + str(_iter) + '_beam_size_' + str(args.beam_size) + '/' + 'rouge_dec_dir/' + '*.txt'
print(dec_path)
print('decode:', len(glob.glob(dec_path)))

hyps = [' '.join(open(f).readlines()) for f in glob.glob(dec_path)]
print('hyps:', len(hyps))
print()
print('hyps first 10 lines:')
print('\n'.join(hyps[:10]))
print()
print('hyps last 10 lines:')
print('\n'.join(hyps[-10:]))
print()

if args.mode == 'final':
    with open('result.txt', 'w') as f:
        for line in hyps:
            f.write(line.replace("\n", "\\n") + '\n')
else:
    ref_path = args.beam_dir + args.mode + '_iter_' + str(_iter) + '_beam_size_' + str(args.beam_size) + '/' + 'rouge_ref_dir/' + '*.txt'
    print('reference:', len(glob.glob(ref_path)))
    refs = [open(f).readline() for f in glob.glob(ref_path)]
    print('refs:')
    print('\n'.join(refs[:10]))
    rouge = Rouge()
    scores = rouge.get_scores(hyps, refs, avg=True)
    print(scores)
def main():
    smoothie = SmoothingFunction().method4
    data_dir_path = 'data'
    model_dir_path = 'models'

    print('loading csv file ...')
    df = pd.read_csv(data_dir_path + "/lenta_test.csv")
    X = df['text']
    Y = df['title']

    # loading our model
    model_path = Embedding_Seq2SeqSummarizer.get_config_file_path(model_dir_path=model_dir_path)
    with open(model_path, 'rb') as data:
        config = pickle.load(data)
    summarizer = Embedding_Seq2SeqSummarizer(config)
    summarizer.load_weights(weight_file_path=Embedding_Seq2SeqSummarizer.get_weight_file_path(model_dir_path=model_dir_path))

    print('start predicting ...')
    result = ''
    bleus = []
    beam_bleus = []
    rouge = Rouge()
    refs, greedy_hyps, beam_hyps = [], [], []

    # some decent examples
    demo = [3, 5, 31, 36, 37, 47, 54, 55, 99, 19, 39, 119]
    for i in demo:
        # for i in range(50):
        x = X[i]
        actual_headline = Y[i]
        refs.append(actual_headline)

        headline = summarizer.summarize(x)
        greedy_hyps.append(headline)

        beam_headline = summarizer.beam_search(x, 3)
        beam_hyps.append(beam_headline)

        bleu = sentence_bleu([word_tokenize(actual_headline.lower())], word_tokenize(headline), smoothing_function=smoothie)
        bleus.append(bleu)
        beam_bleu = sentence_bleu([word_tokenize(actual_headline.lower())], word_tokenize(beam_headline), smoothing_function=smoothie)
        beam_bleus.append(beam_bleu)

        # if i % 200 == 0 and i != 0:
        #     print(i)
        #     print("BLEU: ", np.mean(np.array(bleus)))
        #     print("BEAM BLEU: ", np.mean(np.array(beam_bleus)))

        print(f'№ {i}')
        # print('Article: ', x)
        print('Original Headline: ', actual_headline)
        print('Generated Greedy Headline: ', headline)
        print('Generated Beam Headline: ', beam_headline)
        print('\n')

    print('__________METRICS SUMMARY____________')
    avg_greedy_scores = rouge.get_scores(greedy_hyps, refs, avg=True)
    rouge1f = avg_greedy_scores['rouge-1']['f']
    rouge2f = avg_greedy_scores['rouge-2']['f']
    rougelf = avg_greedy_scores['rouge-l']['f']
    score = np.mean([rouge1f, rouge2f, rougelf])
    print('Greedy Rouge (Dialogue 2019): ', score)

    avg_beam_scores = rouge.get_scores(beam_hyps, refs, avg=True)
    rouge1f = avg_beam_scores['rouge-1']['f']
    rouge2f = avg_beam_scores['rouge-2']['f']
    rougelf = avg_beam_scores['rouge-l']['f']
    score = np.mean([rouge1f, rouge2f, rougelf])
    print('Beam search Rouge (Dialogue 2019): ', score)

    def average(lst):
        return float(sum(lst)) / float(len(lst))

    print("Greedy Bleu: ", average(bleus))
    print("Beam search Bleu: ", average(beam_bleus))
    print('_____________________________________')
    Usage: --file1=hypos --file2=real --output=output")
parser.add_argument("--file1", type=str)
parser.add_argument("--file2", type=str)
parser.add_argument("--output", type=str)
args = parser.parse_args()

trans_lines, refer_lines, candidates, references = read_file(args.file1, args.file2)
# print(candidates[:5])
# print(references[:5])

bleu_1 = corpus_bleu(references, candidates, weights=(1, 0, 0, 0))
bleu_4 = corpus_bleu(references, candidates)

rouge_ = Rouge()
rouge_score = rouge_.get_scores(trans_lines, refer_lines)
rouge_1 = rouge_2 = rouge_l = 0
for score in rouge_score:
    rouge_1 += score['rouge-1']['r']
    rouge_2 += score['rouge-2']['r']
    rouge_l += score['rouge-l']['f']
rouge_1 /= len(rouge_score)
rouge_2 /= len(rouge_score)
rouge_l /= len(rouge_score)

metrics = "bleu-1: {}, bleu-4: {}, rouge-1: {}, rouge-2: {}, rouge-l: {}".format(
    "%.4f" % bleu_1, "%.4f" % bleu_4, "%.4f" % rouge_1, "%.4f" % rouge_2, "%.4f" % rouge_l)
with open(args.output + '/metrics.txt', 'w', encoding='utf-8') as f:
    f.write(metrics)
    temp_indices = []
    for i in range(0, j):
        temp_indices.append(indices[i])
    temp_indices.sort()

    hypothesis = ""  # our list of chosen sentences
    for i in range(0, j):
        hypothesis += sentences[temp_indices[i]]
    # print(hypothesis)

    reference = page.summary  # provided wikipedia summary
    ref_sent = sent_tokenize(reference)
    reference = ""
    for i in range(0, len(ref_sent)):
        reference += ref_sent[i]

    rouge = Rouge()
    score_text = rouge.get_scores(hypothesis, reference)
    f = score_text[0]['rouge-l']['f']
    p = score_text[0]['rouge-l']['p']
    r = score_text[0]['rouge-l']['r']
    score = (f, p, r)
    scores.append(score)

if len(scores) < 5:
    for i in range(len(scores), 5):
        scores.append((0, 0, 0))

total_scores.append(scores)
text_lengths.append(text_length)

# sorting
bin200 = []
bin500 = []
bin1000 = []
def run_training(generator, discriminator, generator_batcher,
                 discriminator_batcher, summary_writer, sess_context_manager):
    print('#########################################################################')
    print('Start Adversarial Training...')
    with sess_context_manager as sess:
        D_rewards = np.zeros((FLAGS.batch_size, FLAGS.max_dec_steps))
        rouge_rewards = np.zeros((FLAGS.batch_size, 1))

        while True:
            # Train the generator for one step.
            for it in range(1):
                batch = generator_batcher.next_batch()
                batch.batch_reward = D_rewards
                batch.batch_rouge_reward = rouge_rewards

                tf.logging.info('running training step...')
                t0 = time.time()
                result_train = generator.run_train_step(sess, batch)
                t1 = time.time()
                tf.logging.info('seconds for training step: %.3f', t1 - t0)

                loss = result_train['loss']
                tf.logging.info('Generator train loss: %f', loss)  # print the loss to screen
                summaries = result_train['summaries']
                train_step = result_train['global_step']
                summary_writer.add_summary(summaries, train_step)  # write the summaries

                rg = Rouge()
                gtruth_token = batch.target_batch
                output_sample_token = np.transpose(np.squeeze(result_train['output_sample_token']))
                output_argmax_token = np.transpose(np.squeeze(result_train['output_summary_token']))

                def remove_eos(input_text):
                    # Token id 3 marks end-of-sequence; clip everything after it.
                    _input_text_eos = np.where(input_text == 3)[0]
                    if len(_input_text_eos) != 0:
                        clipped_text = input_text[:_input_text_eos[0]]
                    else:
                        clipped_text = input_text
                    return ' '.join(map(str, clipped_text))

                # Self-critical ROUGE reward: the greedy (argmax) decode is the
                # baseline for the sampled decode.
                rouge_rewards = []
                for gt, sample, argmax in zip(gtruth_token, output_sample_token, output_argmax_token):
                    _gt = remove_eos(gt)
                    _sample = remove_eos(sample)
                    _argmax = remove_eos(argmax)
                    r_baseline = rg.get_scores(_gt, _argmax)[0]['rouge-l']['f']
                    r_sample = rg.get_scores(_gt, _sample)[0]['rouge-l']['f']
                    rouge_rewards.append(r_baseline - r_sample)
                rouge_rewards = np.reshape(rouge_rewards, [FLAGS.batch_size, 1])
                tf.logging.info('RL reward for rouge-L: %.3f', np.mean(rouge_rewards))

                tf.logging.info('running rollout step...')
                t0 = time.time()
                result_rollout = generator.run_rollout_step(sess, batch)
                t1 = time.time()
                tf.logging.info('seconds for rollout step: %.3f', t1 - t0)

                # shape: [rollout_num, number_of_rolls, batch_size, seq_len]
                rollout_output = result_rollout['rollout_token']
                given_number_of_rollout = rollout_output.shape[1]

                # Calculate D_rewards by scoring the rollout samples with the discriminator.
                print('start to calculate D_rewards')
                _feed_output_token = np.reshape(rollout_output, [-1, FLAGS.max_dec_steps])
                feed_output_token = []
                for sent in _feed_output_token:
                    index_list = np.where(sent == 3)[0]
                    if len(index_list) != 0:
                        # Pad everything after EOS with token id 1.
                        # (The original hard-coded 100 here, which assumes max_dec_steps == 100.)
                        ind = index_list[0]
                        new_sent = np.concatenate([sent[:ind + 1], np.ones(FLAGS.max_dec_steps - ind - 1)])
                        feed_output_token.append(new_sent)
                    else:
                        feed_output_token.append(np.array(sent, dtype=np.int32))
                feed_output_token = np.array(feed_output_token)
                feed_output_token = feed_output_token.reshape((len(feed_output_token), -1))
                print('feed_output_token.shape:', feed_output_token.shape)

                if feed_output_token.shape[1] > 1:
                    # Clip out-of-vocabulary ids so the discriminator's embedding lookup stays in range.
                    for i in range(len(feed_output_token)):
                        clip_index = np.where(np.array(feed_output_token[i]) > FLAGS.vocab_size - 1)[0]
                        feed_output_token[i][clip_index] = 0

                    ypred_for_auc = []
                    for feed_output_token_small in np.split(feed_output_token, FLAGS.rollout):
                        feed = {
                            discriminator.input_x: feed_output_token_small,
                            discriminator.dropout_keep_prob: 1.0
                        }
                        # ypred_for_auc: [rollout_num * number_of_rolls * batch_size, 2]
                        ypred_for_auc.append(sess.run(discriminator.ypred_for_auc, feed))
                    ypred_for_auc = np.concatenate(ypred_for_auc)
                    ypred = np.array([item[1] for item in ypred_for_auc])
                    framed_ypred = np.reshape(ypred, [FLAGS.rollout, given_number_of_rollout, FLAGS.batch_size])
                    # Average over rollouts; shape [batch_size, output_max_len // 20].
                    rewards = np.transpose(np.sum(framed_ypred, 0)) / (1.0 * FLAGS.rollout)
                    if np.std(rewards) != 0.:
                        rewards = (rewards - np.mean(rewards)) / np.std(rewards)

                    # Spread the per-roll rewards across the decoder steps.
                    D_rewards = np.zeros((FLAGS.batch_size, FLAGS.max_dec_steps))
                    print('rewards.shape:', rewards.shape)
                    for count, i in enumerate(range(1, FLAGS.max_dec_steps,
                                                    int(FLAGS.max_dec_steps / rewards.shape[1]))):
                        D_rewards[:, i] = rewards[:, count]
                else:
                    # Degenerate rollout batch: keep only the first token per sample.
                    feed_output_token = np.array([sent[0] for sent in feed_output_token])
                    print('feed_output_token (filtered):', feed_output_token.shape)
                    print('Filtered out degenerate rollout batch.')

            # Train the discriminator.
            print('Start to train the Discriminator!')
            for _ in tqdm(range(5)):
                batch = discriminator_batcher.next_batch()
                res = generator.run_summary_token_step(sess, batch)
                _output_argmax_summary = res['output_summary_token']
                # [batch_size, max_dec_steps]
                _output_argmax_summary = np.transpose(np.squeeze(_output_argmax_summary))
                gtruth_data = batch.target_batch  # [batch_size, max_dec_steps]; format: [[], [], ...]

                output_argmax_summary = []
                for sent in _output_argmax_summary:
                    index_list = np.where(sent == 3)[0]
                    if len(index_list) != 0:
                        ind = index_list[0]
                        new_sent = np.concatenate([sent[:ind + 1],
                                                   np.ones(FLAGS.max_dec_steps - ind - 1)])
                        output_argmax_summary.append(new_sent)
                    else:
                        output_argmax_summary.append(sent)
                output_argmax_summary = np.array(output_argmax_summary)

                # Ground-truth summaries are positive examples; generated ones are negative.
                positive_examples = list(gtruth_data)
                negative_examples = list(output_argmax_summary)

                dis_data_loader = Dis_dataloader(FLAGS.batch_size, FLAGS.vocab_size)
                max_epoch = 3
                for epoch in range(max_epoch):
                    dis_data_loader.load_data(positive_examples, negative_examples)
                    dis_data_loader.reset_pointer()
                    for it in range(dis_data_loader.num_batch):
                        x_batch, y_batch = dis_data_loader.next_batch()
                        feed = {
                            discriminator.input_x: x_batch,
                            discriminator.input_y: y_batch,
                            discriminator.dropout_keep_prob: 0.5
                        }
                        _ = sess.run(discriminator.train_op, feed)
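## The generator reward above is a self-critical baseline: the greedy (argmax)
## decode anchors the sampled decode, both scored with ROUGE-L F1 against the
## ground truth. Isolated as a sketch (the helper name is ours, not from the
## original code):
from rouge import Rouge

def self_critical_rouge_reward(gt_tokens, sample_tokens, argmax_tokens):
    rg = Rouge()
    gt = ' '.join(map(str, gt_tokens))
    r_baseline = rg.get_scores(gt, ' '.join(map(str, argmax_tokens)))[0]['rouge-l']['f']
    r_sample = rg.get_scores(gt, ' '.join(map(str, sample_tokens)))[0]['rouge-l']['f']
    # Sign convention follows run_training above: positive when the sample
    # underperforms the greedy baseline.
    return r_baseline - r_sample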
def multi_generate(importance, start, end):
    """Generate and score synopses for several works in one run."""
    corpus_accessor = CorpusAccessor()
    output_file_path = 'result_start_' + str(start) + '_end_' + str(end) + '.txt'
    file = open(output_file_path, 'w')

    # Build one trained LSTM summarizer per genre; all share the same feature set.
    summarizers = {}
    for genre_name in ['love_story', 'fantasy', 'literature', 'sf']:
        summarizer = LSTMSummarizer()
        supplier = LSTMVectorSupplier(genre_name,
                                      importance,
                                      use_data_of_position_of_sentence=True,
                                      use_data_of_is_serif=True,
                                      use_data_of_is_include_person=True,
                                      use_data_of_sentence_length=True)
        summarizer.set_supplier(supplier)
        summarizer.set_trained_model()
        summarizers[genre_name] = summarizer

    # sys.setrecursionlimit(20000)
    rouge = Rouge()
    for i, ncode in enumerate(corpus_accessor.exist_ncodes[start:end]):
        print('processed ncode count: ', i)
        genre = corpus_accessor.get_genre(ncode)
        if len(genre) == 0:
            print('no genre')
            continue
        ref = ''.join(corpus_accessor.get_synopsis_lines(ncode))
        synopsis = summarizers[genre].generate(ncode) if genre in summarizers else ''
        # ROUGE-1 recall on whitespace-segmented (wakati) text.
        score = rouge.get_scores(wakati(synopsis), wakati(ref), False)[0]['rouge-1']['r']
        file.write(ncode + '\n')
        file.write(genre + '\n')
        file.write('score: ' + str(score) + '\n')
        file.write(ref + '\n\n')
        file.write(synopsis + '\n\n\n')
    file.close()
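## `wakati` is not defined in this snippet. A plausible implementation (an
## assumption on our part, not the author's confirmed code) uses MeCab's
## -Owakati mode to space-separate Japanese tokens, since Rouge tokenizes on
## whitespace:
import MeCab

_tagger = MeCab.Tagger('-Owakati')

def wakati(text):
    # Returns the input as space-separated surface forms, e.g. '今日 は 晴れ'.
    return _tagger.parse(text).strip()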
def validation_step(self, batch, batch_idx):
    rouge = Rouge()
    source_tensor, target_tensor, no_sos, no_eos = batch

    target_tensor = target_tensor.view(1, self.padding)
    target_tensor = target_tensor.type(torch.LongTensor).to(target_tensor.device)
    no_sos = no_sos.view(1, self.padding)
    no_sos = no_sos.type(torch.LongTensor).to(no_sos.device)
    no_eos = no_eos.view(1, self.padding)
    no_eos = no_eos.type(torch.LongTensor).to(no_eos.device)

    # ________
    # COMPUTE LOSS
    # ________
    output = self(source_tensor, no_eos)
    output_dim = output.shape[-1]
    ignore_index = DataUtils().text2index(["<pad>"],
                                          DataUtils().vocab_word2int(self.path_to_vocab_file_all))[0][0]
    criterion = nn.CrossEntropyLoss(ignore_index=ignore_index)
    loss = criterion(output.view(-1, output_dim), no_sos.view(-1))

    # ________
    # COMPUTE METRICS
    # ________
    # Assumes batch size 1; comment this section out for larger batch sizes.
    # Note: the original code used `hypothesis` for the ground truth and
    # `reference` for the model output; the names below follow the usual
    # convention instead (reference = ground truth, hypothesis = model output).

    # Ground-truth sentence as a token list.
    reference = DataUtils().int2text(target_tensor[0].tolist(),
                                     DataUtils().vocab_int2word(self.path_to_vocab_file_all))
    reference = [tok for tok in reference if tok not in ("<pad>", "<eos>", "<sos>")]
    ref_str = " ".join(reference)

    # FULL: greedy decode of the model output (online approach).
    decoded_words = []
    for ot in range(output.size(0)):
        topv, topi = output[ot].topk(1)
        if topi[0].item() == self.EOS_token:
            decoded_words.append('<eos>')
            break
        else:
            decoded_words.append(topi[0].item())

    # ONE BY ONE
    # Uncomment to use; this approach takes longer to reproduce results.
    # Original approach from "Attention Is All You Need".
    # memory = self.model.transformer.encoder(self.model.pos_encoder(source_tensor))
    # sos_index = DataUtils().text2index(["<sos>"], DataUtils().vocab_word2int(self.path_to_vocab_file_all))[0][0]
    # decoded_words = [sos_index, ]
    # for i in range(self.max_length):
    #     trg_tensor = torch.LongTensor(decoded_words).unsqueeze(1).to(device)
    #     output = self.model.fc_out(self.model.transformer.decoder(self.model.pos_decoder(self.model.decoder_emb(trg_tensor)), memory))
    #     out_token = output.argmax(2)[-1].item()
    #     decoded_words.append(out_token)
    #     if out_token == DataUtils().text2index(["<eos>"], DataUtils().vocab_word2int(self.path_to_vocab_file_all))[0][0]:
    #         break

    hypothesis = DataUtils().int2text(decoded_words,
                                      DataUtils().vocab_int2word(self.path_to_vocab_file_all))
    hypothesis = [tok for tok in hypothesis if tok not in ("<pad>", "<eos>", "<sos>")]
    # Cut overly long outputs; can be removed once the model predicts correct lengths.
    hypothesis = hypothesis[:len(reference)]
    hyp_str = " ".join(hypothesis)

    print(f"\nhyp_str: {hyp_str}")
    print(f"ref_str: {ref_str}")

    bleu1_score = round(sentence_bleu([reference], hypothesis, weights=(1, 0, 0, 0)), 4)
    bleu2_score = round(sentence_bleu([reference], hypothesis, weights=(0.5, 0.5, 0, 0)), 4)
    bleu3_score = round(sentence_bleu([reference], hypothesis, weights=(0.33, 0.33, 0.33, 0)), 4)
    bleu4_score = round(sentence_bleu([reference], hypothesis, weights=(0.25, 0.25, 0.25, 0.25)), 4)
    # Depending on the nltk version, single_meteor_score may expect token lists
    # rather than strings.
    meteor_score = round(single_meteor_score(ref_str, hyp_str), 4)
    wer_score = round(wer(ref_str, hyp_str), 4)  # jiwer convention: wer(ground_truth, hypothesis)
    try:
        rouge_score = round(rouge.get_scores(hyp_str, ref_str)[0]["rouge-l"]["f"], 4)
    except ValueError:  # raised when the hypothesis or reference is empty
        rouge_score = 0.0

    self.metrics["bleu1"].append(bleu1_score)
    self.metrics["bleu2"].append(bleu2_score)
    self.metrics["bleu3"].append(bleu3_score)
    self.metrics["bleu4"].append(bleu4_score)
    self.metrics["meteor"].append(meteor_score)
    self.metrics["rouge"].append(rouge_score)
    self.metrics["wer"].append(wer_score)

    self.writer.add_scalars('metrics', {
        'bleu1': mean(self.metrics["bleu1"]),
        'bleu2': mean(self.metrics["bleu2"]),
        'bleu3': mean(self.metrics["bleu3"]),
        'bleu4': mean(self.metrics["bleu4"]),
        'meteor': mean(self.metrics["meteor"]),
        'rouge': mean(self.metrics["rouge"]),
        'wer': mean(self.metrics["wer"]),
    }, self.current_epoch)
    self.writer.add_scalar('lr', self.learning_rate, self.current_epoch)

    # Reset the running metrics for the next epoch.
    self.metrics = {"bleu1": [], "bleu2": [], "bleu3": [],
                    "bleu4": [], "meteor": [], "rouge": [], "wer": []}

    return {'val_loss': loss.item()}
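## Why validation_step wraps rouge.get_scores in try/except: the `rouge`
## package raises ValueError when the hypothesis or reference is empty, e.g.
## when the model emits <eos> immediately. A reusable guard (the helper name
## is ours):
from rouge import Rouge

def safe_rouge_l_f(hyp_str, ref_str):
    try:
        return Rouge().get_scores(hyp_str, ref_str)[0]['rouge-l']['f']
    except ValueError:
        return 0.0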