def language_eval_excoco(predictions, predictions_bleu, sents_label_eval, loader):
    Scorer = CiderD()
    Bleu_scorer = Bleu(4)
    METEOR_scorer = Meteor()
    ROUGE_scorer = Rouge()

    c_score, _ = Scorer.compute_score(sents_label_eval, predictions)
    b_score, _ = Bleu_scorer.compute_score(sents_label_eval, predictions_bleu)
    m_score, _ = METEOR_scorer.compute_score(sents_label_eval, predictions_bleu)
    r_score, _ = ROUGE_scorer.compute_score(sents_label_eval, predictions_bleu)

    print('Evaluating {} samples'.format(len(predictions)))
    print('Bleu_1 : ' + str(b_score[0]))
    print('Bleu_2 : ' + str(b_score[1]))
    print('Bleu_3 : ' + str(b_score[2]))
    print('Bleu_4 : ' + str(b_score[3]))
    print('METEOR : ' + str(m_score))
    print('ROUGE_L : ' + str(r_score))
    print('CIDEr : ' + str(c_score))

    lang_stat = {}
    lang_stat['BLEU_1'] = b_score[0]
    lang_stat['BLEU_2'] = b_score[1]
    lang_stat['BLEU_3'] = b_score[2]
    lang_stat['BLEU_4'] = b_score[3]
    lang_stat['METEOR'] = m_score
    lang_stat['ROUGE_L'] = r_score
    lang_stat['CIDEr'] = c_score
    return lang_stat

def test(model, dataloader, args):
    scorer = Bleu(4)
    m_scorer = Meteor()
    r_scorer = Rouge()
    hyp = []
    ref = []
    model.eval()
    gold_file = open('tmp_gold.txt', 'w')
    pred_file = open('tmp_pred.txt', 'w')
    with tqdm(dataloader, desc='Test ', mininterval=1) as tq:
        for batch in tq:
            with torch.no_grad():
                seq = model(batch, beam_size=args.beam_size)
            r = write_txt(batch, batch['tgt_text'], gold_file, args)
            h = write_txt(batch, seq, pred_file, args)
            hyp.extend(h)
            ref.extend(r)
    hyp = dict(zip(range(len(hyp)), hyp))
    ref = dict(zip(range(len(ref)), ref))
    print(hyp[0], ref[0])
    print('BLEU INP', len(hyp), len(ref))
    print('BLEU', scorer.compute_score(ref, hyp)[0])
    print('METEOR', m_scorer.compute_score(ref, hyp)[0])
    print('ROUGE_L', r_scorer.compute_score(ref, hyp)[0])
    gold_file.close()
    pred_file.close()

def test(model_path='models/model-61', video_feat_path=video_feat_path):
    train_data, test_data = get_video_data(video_data_path, video_feat_path, train_ratio=0.7)
    test_videos = test_data['video_path'].values
    test_captions = test_data['Description'].values
    ixtoword = pd.Series(np.load('./data/ixtoword.npy').tolist())

    test_videos_unique = list()
    test_captions_list = list()
    for (video, caption) in zip(test_videos, test_captions):
        if len(test_videos_unique) == 0 or test_videos_unique[-1] != video:
            test_videos_unique.append(video)
            test_captions_list.append([caption])
        else:
            test_captions_list[-1].append(caption)

    model = Video_Caption_Generator(
        dim_image=dim_image,
        n_words=len(ixtoword),
        dim_embed=dim_embed,
        dim_hidden=dim_hidden,
        batch_size=batch_size,
        encoder_max_sequence_length=encoder_step,
        decoder_max_sentence_length=decoder_step,
        bias_init_vector=None)
    video_tf, video_mask_tf, caption_tf, probs_tf, last_embed_tf = model.build_generator()
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    saver.restore(sess, model_path)

    scorer = Meteor()
    scorer_bleu = Bleu(4)
    GTS = defaultdict(list)
    RES = defaultdict(list)
    counter = 0
    for (video_feat_path, caption) in zip(test_videos_unique, test_captions_list):
        generated_sentence = gen_sentence(
            sess, video_tf, video_mask_tf, caption_tf, video_feat_path, ixtoword)
        print(video_feat_path, generated_sentence)
        # print(caption)
        GTS[str(counter)] = [{'image_id': str(counter), 'cap_id': i, 'caption': s}
                             for i, s in enumerate(caption)]
        RES[str(counter)] = [{'image_id': str(counter), 'caption': generated_sentence[:-2] + '.'}]
        # GTS[video_feat_path] = caption
        # RES[video_feat_path] = [generated_sentence[:-2] + '.']
        counter += 1
        # ipdb.set_trace()

    tokenizer = PTBTokenizer()
    GTS = tokenizer.tokenize(GTS)
    RES = tokenizer.tokenize(RES)
    score, scores = scorer.compute_score(GTS, RES)
    print("METEOR", score)
    score, scores = scorer_bleu.compute_score(GTS, RES)
    print("BLEU", score)

def bleu(gts, res):
    scorer = Bleu(n=4)
    # scorer += (hypo[0], ref1)
    # hypo[0] = 'word1 word2 word3 ...'
    # ref = ['word1 word2 word3 ...', 'word1 word2 word3 ...']
    score, scores = scorer.compute_score(gts, res)
    print('bleu = %s' % score)

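# A minimal sketch of the input format the scorers above expect, using assumed
# toy data: gts (references) and res (hypotheses) map the same keys to lists of
# whitespace-tokenized strings, with exactly one hypothesis string per key.
def bleu_format_demo():
    toy_gts = {
        '0': ['a man is riding a horse', 'a person rides a brown horse'],
        '1': ['two dogs play in the grass'],
    }
    toy_res = {
        '0': ['a man rides a horse'],
        '1': ['dogs are playing in the grass'],
    }
    # compute_score returns (score, scores): score is the corpus-level
    # [Bleu_1, Bleu_2, Bleu_3, Bleu_4], scores holds the per-key values for each n.
    score, scores = Bleu(n=4).compute_score(toy_gts, toy_res)
    return score
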
class CaptionStatsManager(nt.StatsManager):

    def __init__(self):
        super(CaptionStatsManager, self).__init__()

    def init(self):
        super(CaptionStatsManager, self).init()
        self.tokenized_true = {}
        self.tokenized_pred = {}
        self.scorer = Bleu(4)
        self.running_bleu_scores = [0 for _ in range(4)]

    def accumulate(self, loss, x, y, d):
        super(CaptionStatsManager, self).accumulate(loss, x, y, d)
        self.tokenized_true[0] = []
        self.tokenized_pred[0] = []
        _, pred_cap_lab = torch.max(y, 1)
        true_cap_lab = d
        pred_cap = index_to_cap(pred_cap_lab)
        true_cap = index_to_cap(true_cap_lab)
        self.tokenized_true[0].append(true_cap)
        self.tokenized_pred[0].append(pred_cap)
        bleu_scores, _ = self.scorer.compute_score(self.tokenized_true, self.tokenized_pred)
        self.running_bleu_scores = list(map(add, self.running_bleu_scores, bleu_scores))

    def summarize(self):
        # this is the average loss when called
        loss = super(CaptionStatsManager, self).summarize()
        # these are the running BLEU-1..4 scores averaged over updates
        bleu_score = [a / self.number_update for a in self.running_bleu_scores]
        return {'loss': loss, 'bleu': bleu_score}

class TextCapsBleu4Evaluator:
    def __init__(self):
        # The following script requires Java 1.8.0 and pycocotools installed.
        # The pycocoevalcap can be installed with pip as
        # pip install git+https://github.com/ronghanghu/coco-caption.git@python23
        # Original pycocoevalcap code is at https://github.com/tylin/coco-caption
        # but has no python3 support yet.
        try:
            from pycocoevalcap.bleu.bleu import Bleu
            from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
        except ModuleNotFoundError:
            print(
                "Please install pycocoevalcap module using "
                "pip install git+https://github.com/ronghanghu/coco-caption.git@python23"  # noqa
            )
            raise

        self.tokenizer = PTBTokenizer()
        self.scorer = Bleu(4)

    def eval_pred_list(self, pred_list):
        # Create reference and hypotheses captions.
        gts = {}
        res = {}
        for idx, entry in enumerate(pred_list):
            gts[idx] = [{"caption": a} for a in entry["gt_answers"]]
            res[idx] = [{"caption": entry["pred_answer"]}]

        gts = self.tokenizer.tokenize(gts)
        res = self.tokenizer.tokenize(res)
        score, _ = self.scorer.compute_score(gts, res)

        bleu4 = score[3]  # score is (Bleu-1, Bleu-2, Bleu-3, Bleu-4)
        return bleu4

def eval_div_stats(dataset, preds_n, model_id, split):
    tokenizer = PTBTokenizer()

    capsById = {}
    for i, d in enumerate(preds_n):
        d['id'] = i
        capsById[d['image_id']] = capsById.get(d['image_id'], []) + [d]

    n_caps_perimg = len(capsById[list(capsById.keys())[0]])
    print(n_caps_perimg)
    _capsById = capsById  # save the untokenized version
    capsById = tokenizer.tokenize(capsById)

    div_1, adiv_1 = compute_div_n(capsById, 1)
    div_2, adiv_2 = compute_div_n(capsById, 2)

    globdiv_1, _ = compute_global_div_n(capsById, 1)

    print('Diversity Statistics are as follows: \n Div1: %.2f, Div2: %.2f, gDiv1: %d\n'
          % (div_1, div_2, globdiv_1))

    # compute mbleu
    scorer = Bleu(4)
    all_scrs = []
    scrperimg = np.zeros((n_caps_perimg, len(capsById)))

    for i in range(n_caps_perimg):
        tempRefsById = {}
        candsById = {}
        for k in capsById:
            tempRefsById[k] = capsById[k][:i] + capsById[k][i + 1:]
            candsById[k] = [capsById[k][i]]

        score, scores = scorer.compute_score(tempRefsById, candsById)
        all_scrs.append(score)
        scrperimg[i, :] = scores[1]

    all_scrs = np.array(all_scrs)

    out = {}
    out['overall'] = {'Div1': div_1, 'Div2': div_2, 'gDiv1': globdiv_1}
    for k, score in zip(range(4), all_scrs.mean(axis=0).tolist()):
        out['overall'].update({'mBLeu_%d' % (k + 1): score})

    imgToEval = {}
    for i, imgid in enumerate(capsById.keys()):
        imgToEval[imgid] = {'mBleu_2': scrperimg[:, i].mean()}
        imgToEval[imgid]['individuals'] = []
        for j, d in enumerate(_capsById[imgid]):
            imgToEval[imgid]['individuals'].append(preds_n[d['id']])
            imgToEval[imgid]['individuals'][-1]['mBleu_2'] = scrperimg[j, i]
    out['ImgToEval'] = imgToEval

    print('Mean mutual Bleu scores on this set is:\nmBLeu_1, mBLeu_2, mBLeu_3, mBLeu_4')
    print(all_scrs.mean(axis=0))

    return out

def coco_evaluate(self, path1: str, path2: str, kaldi_stream: str, kaldi_scp: str,
                  caption_file: str, max_length: int = None,
                  output: str = "coco_scores.txt"):
    key2pred = self._ensemble(path1, path2, kaldi_stream, kaldi_scp, max_length)

    caption_df = pd.read_json(caption_file)
    caption_df["key"] = caption_df["filename"].apply(lambda x: os.path.splitext(x)[0])
    key2refs = caption_df.groupby(["key"])["caption"].apply(list).to_dict()

    from pycocoevalcap.bleu.bleu import Bleu
    from pycocoevalcap.rouge.rouge import Rouge
    from pycocoevalcap.cider.cider import Cider
    from pycocoevalcap.meteor.meteor import Meteor
    from pycocoevalcap.spice.spice import Spice

    f = open(output, "w")

    scorer = Bleu(n=4)
    score, scores = scorer.compute_score(key2refs, key2pred)
    for n in range(4):
        f.write("Bleu-{}: {:6.3f}\n".format(n + 1, score[n]))

    scorer = Rouge()
    score, scores = scorer.compute_score(key2refs, key2pred)
    f.write("ROUGE: {:6.3f}\n".format(score))

    scorer = Cider()
    score, scores = scorer.compute_score(key2refs, key2pred)
    f.write("CIDEr: {:6.3f}\n".format(score))

    scorer = Meteor()
    score, scores = scorer.compute_score(key2refs, key2pred)
    f.write("Meteor: {:6.3f}\n".format(score))

    scorer = Spice()
    score, scores = scorer.compute_score(key2refs, key2pred)
    f.write("Spice: {:6.3f}\n".format(score))

    f.close()

class Metrics:
    def __init__(self):
        pass

    def bleu(self, hypo, ref):
        self.bleu_scorer = Bleu(4)
        final_scores = {}
        score, scores = self.bleu_scorer.compute_score(ref, hypo)
        for m, s in zip(["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"], score):
            final_scores[m] = s
        return final_scores

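# Hedged usage sketch for the Metrics helper above, with assumed toy inputs:
# hypo holds one generated sentence per key, ref one or more references per key.
def metrics_bleu_demo():
    hypo = {0: ['a cat sits on the mat']}
    ref = {0: ['a cat is sitting on a mat', 'there is a cat on the mat']}
    # Returns a dict like {'Bleu_1': ..., 'Bleu_2': ..., 'Bleu_3': ..., 'Bleu_4': ...}
    return Metrics().bleu(hypo, ref)
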
def get_corpus_bleu(model, data_loader, vocabs, device, beam_size):
    import torch
    from pycocoevalcap.bleu.bleu import Bleu
    from pycocoevalcap.cider.cider import Cider
    from pycocoevalcap.rouge.rouge import Rouge

    """Defining Scorers"""
    scorer_bleu = Bleu(4)
    scorer_rouge = Rouge()
    scorer_cider = Cider()

    sequences_ref = {}
    sequences_gen = {}

    bad_words = ['<SOS>', '<EOS>', '<UNK>']
    bad_toks = [vocabs['word_vocab'](i) for i in bad_words]

    """Generation Loop"""
    for i, data in enumerate(data_loader):
        with torch.no_grad():
            captions = data['captions']
            length = captions.size(1) - 1
            targets = captions.narrow(1, 1, length)

            images = data['images'].to(device)
            topics = data['topics'].to(device)

            predictions = model.sample_v2(images, topics, beam_size=beam_size)

            sequences_ref[i] = [
                " ".join([vocabs['word_vocab'](j.item())
                          for j in targets[0] if j.item() not in bad_toks])
            ]
            sequences_gen[i] = [
                " ".join([vocabs['word_vocab'](j.item())
                          for j in predictions[0][1] if j.item() not in bad_toks])
            ]
            # sequences_gen[i] = [" ".join([vocabs['word_vocab'](j) for j in predictions[0] if j not in bad_toks])]

    """Getting Scores"""
    bleu_score, bleu_scores = scorer_bleu.compute_score(sequences_ref, sequences_gen)
    rouge_score, rouge_scores = scorer_rouge.compute_score(sequences_ref, sequences_gen)
    cider_score, cider_scores = scorer_cider.compute_score(sequences_ref, sequences_gen)

    scores = {
        'bleu_score': bleu_score,
        'rouge_score': rouge_score,
        'cider_score': cider_score
    }
    print(scores)
    return scores

def val_score(self, s_start=0, num_batches=2):
    bs = self.imp["BATCH_SIZE"]
    bleu = Bleu()
    eval_store_gen = {}
    eval_store_gt = {}
    num_examples = self.test_data.dec_in.get_num_seqs()
    max_num_batches = num_examples // bs
    for i in range(min(num_batches, max_num_batches)):
        s = s_start + bs * i
        e = s_start + bs * (i + 1)
        gen_txt = self.generate(s=s, allow_unk=False)
        gt_txt = self.test_data.dec_out.get_text(s, e)
        fnames = self.test_data.filenames[s:e]
        for g, f in zip(gen_txt, fnames):
            if f not in eval_store_gen:
                eval_store_gen[f] = [" ".join(g)]
        for g, f in zip(gt_txt, fnames):
            if f not in eval_store_gt:
                eval_store_gt[f] = []
            eval_store_gt[f].append(" ".join(g))
    print(bleu.compute_score(eval_store_gt, eval_store_gen)[0])

def _define_metrics(gts, res):
    bleu_scorer = Bleu(n=4)
    bleu, _ = bleu_scorer.compute_score(gts=gts, res=res)

    rouge_scorer = Rouge()
    rouge, _ = rouge_scorer.compute_score(gts=gts, res=res)

    cider_scorer = Cider()
    cider, _ = cider_scorer.compute_score(gts=gts, res=res)

    meteor_scorer = Meteor()
    meteor, _ = meteor_scorer.compute_score(gts=gts, res=res)

    for i in range(4):
        bleu[i] = round(bleu[i], 4)

    return bleu, round(meteor, 4), round(rouge, 4), round(cider, 4)

def compute_bleu_score(decode_res, keys, gts, start_idx, end_idx, vocabulary):
    """
    Args:
        decode_res: decoding results of model, [B, max_length]
        keys: keys of this batch, tuple [B,]
        gts: ground truth sentences of all audios,
             dict(<key> -> [ref_1, ref_2, ..., ref_n])
    Return:
        score: scores of this batch, [B,]
    """
    from pycocoevalcap.bleu.bleu import Bleu
    scorer = Bleu(4)
    hypothesis = {}
    references = {}

    for i in range(decode_res.shape[0]):
        if keys[i] in hypothesis:
            continue

        # prepare candidate
        candidate = []
        for t, w_t in enumerate(decode_res[i]):
            if w_t == start_idx:
                continue
            elif w_t == end_idx:
                break
            else:
                candidate.append(vocabulary.idx2word[w_t])
        hypothesis[keys[i]] = [" ".join(candidate), ]

        # prepare reference
        references[keys[i]] = gts[keys[i]]

    (score, scores) = scorer.compute_score(references, hypothesis)

    key2score = {key: scores[3][i] for i, key in enumerate(hypothesis.keys())}
    results = np.zeros(decode_res.shape[0])
    for i in range(decode_res.shape[0]):
        results[i] = key2score[keys[i]]
    return results

def eval(result_gts_path, result_res_path):
    with open(result_gts_path, 'r') as file:
        gts_dict = json.load(file)
    with open(result_res_path, 'r') as file:
        res_dict = json.load(file)

    bleu_score = Bleu(n=4)
    bleu, _ = bleu_score.compute_score(gts=gts_dict, res=res_dict)

    meteor_score = Meteor()
    meteor, _ = meteor_score.compute_score(gts=gts_dict, res=res_dict)

    rouge_scorer = Rouge()
    rouge, _ = rouge_scorer.compute_score(gts=gts_dict, res=res_dict)

    cider_scorer = Cider()
    cider, _ = cider_scorer.compute_score(gts=gts_dict, res=res_dict)

    return bleu, meteor, rouge, cider

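# Minimal sketch (file names and captions are assumed, not from the original code)
# of the JSON layout eval() above reads: both files map the same string keys to
# lists of caption strings, with a single caption per key on the res side.
def write_eval_inputs_demo(result_gts_path='demo_gts.json', result_res_path='demo_res.json'):
    import json
    gts = {'391895': ['a man riding a motorcycle', 'a person on a motorbike'],
           '522418': ['a woman cutting a large cake']}
    res = {'391895': ['a man rides a motorcycle'],
           '522418': ['a woman is cutting a cake']}
    with open(result_gts_path, 'w') as f:
        json.dump(gts, f)
    with open(result_res_path, 'w') as f:
        json.dump(res, f)
    # afterwards: bleu, meteor, rouge, cider = eval(result_gts_path, result_res_path)
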
class RougeBleuScore(Metric):

    def __init__(self, coco, vocab, n=4):
        self.coco = coco
        self.vocab = vocab
        self.bleu = Bleu(n)
        self.n = n
        self.rouge = Rouge()

    def evaluate(self, y_pred, y, image_ids):
        if type(y_pred) == list:
            caption_pred_list = caption_list_to_words(y_pred, self.vocab)
        else:
            caption_pred_list = tensor_to_words(y_pred, y, self.vocab)
        captions_pred, captions_gt = extract_captions(image_ids, caption_pred_list, self.coco)
        blockPrint()
        scores = self.bleu.compute_score(captions_gt, captions_pred)[0]
        enablePrint()
        scores.append(self.rouge.compute_score(captions_gt, captions_pred)[0])
        return scores

def bleu_scorer(reference, hypothesis):
    # =================================================
    # Compute scores
    # =================================================
    scorer = Bleu(4)
    method = ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]
    # print('computing %s score...' % (scorer.method()))
    score, scores = scorer.compute_score(reference, hypothesis)
    bleus = {}
    if type(method) == list:
        for sc, scs, m in zip(score, scores, method):
            # print("%s: %0.3f" % (m, sc))
            bleus[m] = sc
    else:
        # print("%s: %0.3f" % (method, score))
        bleus[method] = score
    return bleus

def calculate_metric(rnn, meteor=None):
    gts = {}
    res = {}
    lp_avg = 0.0
    lp_c = 0
    for idx in range(rnn.V_valid.shape[0]):
        iid = rnn.Id_valid[idx]
        if iid not in gts:
            gts[iid] = []
            # gts[iid].append(' '.join([rnn.dp.i2w[w] for w in rnn.X_valid[idx] if w != 0][::-1]))
            gts[iid] = [' '.join(rnn.dp.tokens[i][::-1])
                        for i in rnn.dp.img_id_to_tokens[iid]]
        if iid in res:
            continue
        res[iid] = []
        # pos_sen, pos_att = rnn.get_sentence(rnn.V_valid[idx], senti=np.array([1.0], dtype=theano.config.floatX))
        (lp, pos_sen) = decoder_beamsearch(rnn, rnn.V_valid[idx], senti=1.0, beam_size=1)
        pos_sen = pos_sen[:-1]
        print(' '.join(pos_sen[::-1]))
        res[iid].append(' '.join(pos_sen[::-1]))
        lp_avg += np.exp(lp)
        lp_c += 1
    lp_avg /= float(lp_c)
    return lp_avg

    bleu = Bleu()
    print("Bleu:")
    print("Positive:", bleu.compute_score(gts, res)[0])
    rouge = Rouge()
    print("Rouge:")
    print("Positive:", rouge.compute_score(gts, res)[0])
    if meteor is None:
        meteor = Meteor()
    print("Meteor:")
    mscore = meteor.compute_score(gts, res)[0]
    print("Positive:", mscore)
    return mscore

def eval_epoch_bleu(model, validation_data, device, vocab, list_of_refs_dev, args):
    ''' Epoch operation in evaluation phase '''
    model.eval()

    total_loss = 0
    n_word_total = 0
    n_word_correct = 0

    hypotheses = {}
    count = 0
    with torch.no_grad():
        for batch in tqdm(validation_data, mininterval=2,
                          desc=' - (Validation) ', leave=False):
            # prepare data
            image0, image1, image0_attribute, image1_attribute = map(
                lambda x: x.to(device), batch)

            """[src/tgt/memory]_key_padding_mask should be a ByteTensor where True
            values are positions that should be masked with float('-inf') and False
            values will be unchanged. This mask ensures that no information will be
            taken from position i if it is masked, and has a separate mask for each
            sequence in a batch."""

            hyp = beam_search(image0, image1, model, args, vocab,
                              image0_attribute, image1_attribute)
            hyp = hyp.split("<end>")[0].strip()

            hypotheses[count] = [hyp]
            count += 1

    scorer = Bleu(4)
    score, _ = scorer.compute_score(list_of_refs_dev, hypotheses)

    return score

class TextCapsBleu4Evaluator:
    def __init__(self):
        # The following script requires Java 1.8.0 and pycocotools installed.
        # The pycocoevalcap can be installed with pip from M4C-Captioner's Github repo
        # but has no python3 support yet.
        from pycocoevalcap.tokenizer.ptbtokenizer import PTBTokenizer
        from pycocoevalcap.bleu.bleu import Bleu

        self.tokenizer = PTBTokenizer()
        self.scorer = Bleu(4)

    def eval_pred_list(self, pred_list):
        # Create reference and hypotheses captions.
        gts = {}
        res = {}
        for idx, entry in enumerate(pred_list):
            gts[idx] = [{'caption': a} for a in entry['gt_answers']]
            res[idx] = [{'caption': entry['pred_answer']}]

        gts = self.tokenizer.tokenize(gts)
        res = self.tokenizer.tokenize(res)
        score, _ = self.scorer.compute_score(gts, res)

        bleu4 = score[3]  # score is (Bleu-1, Bleu-2, Bleu-3, Bleu-4)
        return bleu4

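# Hedged usage sketch for TextCapsBleu4Evaluator above, with an assumed toy
# pred_list: each entry holds the model caption under 'pred_answer' and the
# reference captions under 'gt_answers'. Running it needs Java for PTBTokenizer,
# as noted in the class comments.
def textcaps_bleu4_demo():
    evaluator = TextCapsBleu4Evaluator()
    pred_list = [
        {'pred_answer': 'a red stop sign on a pole',
         'gt_answers': ['a stop sign on the corner', 'a red stop sign next to the road']},
    ]
    return evaluator.eval_pred_list(pred_list)
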
def train(args): iter_per_epoch = int(math.ceil(conf.num_coco_data * 1.0 / conf.batch_size)) pkl_path = os.path.join(conf.val_small_data_path, 'val_caption.pkl') with open(pkl_path, 'rb') as f: caption_data = pickle.load(f) pass image_id_list = caption_data.keys() bleu_test = Bleu() vocab, reverse_vocab = utils.load_dict(conf.dictionary_path) with tf.device('/cpu:0'): train_image_batch, train_sequence_batch = get_data.batch_train_data( 'train', conf.batch_size, conf.shuffer_buffer_size, 6, conf.train_data_path) val_dataset = get_data.batch_val_data('val', conf.batch_size, 6, conf.val_small_data_path) val_id_batch, val_image_batch = get_data.make_val_iterator(val_dataset) pass logging.info("The input graph defined!") with tf.variable_scope(tf.get_variable_scope()) as scope: train_model = ShowAttendTell(first_time=args.first_time, start_token_index=vocab[conf.start_token], pad_token_index=vocab[conf.pad_token], mat_file=conf.vgg_checkpoint, max_timestep=conf.sentence_length, train_vgg=conf.train_vgg) batch_loss, perplexity, _ = train_model.build_model() scope.reuse_variables() generated_words = train_model.build_validation() pass ave_train_loss = tf.Variable(0, name='ave_train_loss', dtype=tf.float32, trainable=False) bleu1 = tf.Variable(0, name='bleu1', dtype=tf.float32, trainable=False) bleu2 = tf.Variable(0, name='bleu2', dtype=tf.float32, trainable=False) bleu3 = tf.Variable(0, name='bleu3', dtype=tf.float32, trainable=False) bleu4 = tf.Variable(0, name='bleu4', dtype=tf.float32, trainable=False) tf.summary.scalar('ave_train_loss', ave_train_loss) tf.summary.scalar('batch_loss', batch_loss) tf.summary.scalar('batch_perplexity', perplexity) tf.summary.scalar('bleu1', bleu1) tf.summary.scalar('bleu2', bleu2) tf.summary.scalar('bleu3', bleu3) tf.summary.scalar('bleu4', bleu4) all_variable = tf.trainable_variables() for variable in all_variable: tf.summary.histogram(variable.op.name, variable) pass all_gradient = tf.gradients(batch_loss, all_variable) for index, variable in enumerate(all_variable): tf.summary.histogram(variable.op.name + "/gradient", all_gradient[index]) pass with open(conf.global_step_file ) as fd1: # for logging the last global step saved number = int(fd1.readline().strip()) pass global_step_t = tf.Variable(number, name='global_step', trainable=False) learning_rate = tf.train.exponential_decay(conf.learning_rate, global_step_t, conf.decay_step, conf.decay_rate, staircase=True) # optimizer = tf.train.AdamOptimizer(learning_rate=conf.learning_rate) optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate) # for updating the moving average and variance in batch norm update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS) with tf.control_dependencies(update_ops): train_op = optimizer.minimize(batch_loss, global_step=global_step_t) pass logging.info("The optimization operation defined!") saver = tf.train.Saver(max_to_keep=80) ckpt_filename = os.path.join(conf.ckpt_upper_path, 'model.ckpt') with tf.Session() as sess: if args.load_ckpt: newest_checkpoint = tf.train.latest_checkpoint( conf.ckpt_upper_path) utils.restore(sess, newest_checkpoint) pass new_folder_name = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") log_whole_path = os.path.join(conf.model_log_path, new_folder_name) if not os.path.exists(log_whole_path): os.makedirs(log_whole_path) pass merged_summary = tf.summary.merge_all() summary_writer = tf.summary.FileWriter(log_whole_path) summary_writer.add_graph(sess.graph) sess.run(tf.global_variables_initializer()) total_loss = 0.0 start_time = 
time.time() all_time = 0 counter = 0 # b = 30 # for e in range(1): # for i in range(b): for _ in range(conf.epoch): for _ in range(iter_per_epoch): counter += 1 logging.info("In iter %d " % (counter)) image_batch_data, sequence_batch_data = sess.run( [train_image_batch, train_sequence_batch]) feed_dict = { train_model.input_image: image_batch_data, train_model.input_caption: sequence_batch_data } batch_loss_value, batch_perplexity_value, _ = sess.run( [batch_loss, perplexity, train_op], feed_dict=feed_dict) logging.info("batch loss: %s " % batch_loss_value) logging.info("batch perplexity value: %s " % batch_perplexity_value) total_loss += batch_loss_value if counter % 100 == 0: prediction = {} while True: try: val_id_batch_data, val_image_batch_data = sess.run( [val_id_batch, val_image_batch]) pass except tf.errors.OutOfRangeError: with tf.device('/cpu:0'): val_id_batch, val_image_batch = get_data.make_val_iterator( val_dataset) pass break val_feed_dict = { train_model.input_image: val_image_batch_data } caption = sess.run(generated_words, feed_dict=val_feed_dict) for index, id in enumerate(val_id_batch_data): sentence = utils.get_sentence( caption[index], reverse_vocab) prediction[int(id)] = [sentence] pass random_id = random.choice(image_id_list) logging.info("Prediction %s " % prediction[random_id][0]) logging.info("Label %s " % caption_data[random_id][0]) print len(caption_data.keys()) print len(prediction.keys()) score, _ = bleu_test.compute_score(caption_data, prediction) # print "score ", score logging.info("Bleu1 %f " % (score[0])) logging.info("Bleu2 %f " % (score[1])) logging.info("Bleu3 %f " % (score[2])) logging.info("Bleu4 %f " % (score[3])) sess.run(bleu1.assign(score[0])) sess.run(bleu2.assign(score[1])) sess.run(bleu3.assign(score[2])) sess.run(bleu4.assign(score[3])) pass if counter % 50 == 0: sess.run( ave_train_loss.assign(total_loss * 1.0 / (counter))) logging.info("train average loss %f " % (total_loss * 1.0 / (counter))) summary = sess.run(merged_summary, feed_dict=feed_dict) summary_writer.add_summary( summary, tf.train.global_step(sess, global_step_t)) summary_writer.flush() pass if counter % 300 == 0: with open(conf.global_step_file, 'w') as fd: fd.write(str(tf.train.global_step(sess, global_step_t))) pass saver.save(sess, ckpt_filename, global_step=global_step_t) new_time = time.time() time_range = new_time - start_time start_time = new_time all_time += time_range logging.info("batch %d take %f \n" % (counter, time_range)) pass pass pass logging.info("Average time %f " % (all_time * 1.0 / counter)) summary_writer.close() pass
def end_epoch(self, ):
    path = Path(Options()["exp.dir"])
    dirname = path.joinpath("generated_sentences")

    # Create directory if it does not exist
    if not os.path.exists(dirname):
        try:
            os.makedirs(dirname)
        except OSError as exc:
            # Guard against race condition
            if exc.errno != errno.EEXIST:
                raise

    # Dump sentences to the directory
    for field in ["action", "justification"]:
        for key in ["ground_truth", "predicted"]:
            filepath = dirname.joinpath("%s_%s.txt" % (key, field))
            with open(filepath, "w") as f:
                f.write("\n".join(self.sentences[key][field]))

    # Compute NLP quality scores (bleu, meteor, cider...)
    for field in ["action", "justification"]:
        cider = Cider()
        bleu = Bleu()
        meteor = Meteor()

        # Check if this is not empty
        if len(self.sentences["ground_truth"][field]) > 0:
            ground_truth = {
                i: [sentence]
                for i, sentence in enumerate(self.sentences["ground_truth"][field])
            }
            predicted = {
                i: [sentence]
                for i, sentence in enumerate(self.sentences["predicted"][field])
            }

            cider_score, _ = cider.compute_score(ground_truth, predicted)
            cider_score = cider_score * 100  # Convert to percentage

            bleus_score, _ = bleu.compute_score(ground_truth, predicted)
            bleu_score = bleus_score[3] * 100  # Take bleu-4 and convert to percentage

            meteor_score, _ = meteor.compute_score(ground_truth, predicted)
            meteor_score = meteor_score * 100  # Convert to percentage
        else:
            # Otherwise all scores are 0
            cider_score, bleu_score, meteor_score = 0, 0, 0

        Logger().log_value('%s_epoch.cider_%s' % (self.mode, field),
                           cider_score, should_print=True)
        Logger().log_value('%s_epoch.bleucoco_%s' % (self.mode, field),
                           bleu_score, should_print=True)
        Logger().log_value('%s_epoch.meteorcoco_%s' % (self.mode, field),
                           meteor_score, should_print=True)

    # Reset sentences
    self.sentences = {
        "ground_truth": {"action": [], "justification": []},
        "predicted": {"action": [], "justification": []}
    }
    return

def coco_caption_metrics_hier(predicts_list, sentences_list, image_id_list, config,
                              batch_size=26, is_training=True):
    with open(config.vocabulary_path, 'r') as file:
        vocabulary_list = json.load(file)
    word2id = {}
    for i in range(vocabulary_list.__len__()):
        word2id[vocabulary_list[i]] = i
    id2word = {v: k for k, v in word2id.items()}

    gts = {}
    res = {}
    for i in range(0, predicts_list.__len__()):
        for j in range(0, batch_size):
            sent_pre, sent_gt = [], []
            for k in range(config.max_sentence_num * config.max_sentence_length):
                id_input = int(predicts_list[i][k][j])
                sent_pre.append(id2word[id_input])

                id_gt = sentences_list[i][j][k]
                if (not id2word[id_gt].__eq__('</S>')) and (not id2word[id_gt].__eq__('<EOS>')):
                    sent_gt.append(id2word[id_gt])

            # sent_pre2 = sent_pre
            sent_pre2 = []
            for n in range(config.max_sentence_num):
                for m in range(config.max_sentence_length):
                    word = sent_pre[n * config.max_sentence_length + m]
                    if word != '</S>':
                        sent_pre2.append(word)
                    else:
                        break

            str_pre, str_gt = ' '.join(sent_pre2), ' '.join(sent_gt)
            image_id = image_id_list[i][j][0]
            gts[str(image_id)] = [str_gt]
            res[str(image_id)] = [str_pre]

    if not is_training:
        with open(config.result_gts_path, 'w') as file:
            json.dump(gts, file)
        with open(config.result_res_path, 'w') as file:
            json.dump(res, file)

    bleu_scorer = Bleu(n=4)
    bleu, _ = bleu_scorer.compute_score(gts=gts, res=res)

    rouge_scorer = Rouge()
    rouge, _ = rouge_scorer.compute_score(gts=gts, res=res)

    cider_scorer = Cider()
    cider, _ = cider_scorer.compute_score(gts=gts, res=res)

    # meteor_scorer = Meteor()
    # meteor, _ = meteor_scorer.compute_score(gts=gts, res=res)

    for i in range(4):
        bleu[i] = round(bleu[i], 4)

    # return bleu, round(meteor, 4), round(rouge, 4), round(cider, 4)
    return bleu, round(rouge, 4), round(cider, 4)

def bleu(gts, res):
    scorer = Bleu(n=4)
    score, scores = scorer.compute_score(gts, res)
    out_file.write('BLEU(1-4) = %s' % score + '\n')

def get_bleu_score(self):
    bleu = Bleu()
    scores = bleu.compute_score(self.eval_store_gt, self.eval_store_gen)[0]
    return scores

def coco_caption_metrics(predictions_list, image_id_list, vocabulary_path='data/vocabulary.json',
                         max_caption_length=25, batch_size=32, is_training=True):
    with open(vocabulary_path, 'r') as file:
        vocabulary_list = json.load(file)
    word2id = {}
    for i in range(vocabulary_list.__len__()):
        word2id[vocabulary_list[i]] = i
    id2word = {v: k for k, v in word2id.items()}

    with open('data/captions_gt.json', 'r') as file:
        captions_gt_dict = json.load(file)

    gts = {}
    res = {}
    for i in range(0, predictions_list.__len__()):
        for j in range(0, batch_size):
            sen_input, sen_ground_truth = [], []
            for k in range(max_caption_length):
                id_input = int(predictions_list[i][k][j])
                sen_input.append(id2word[id_input])

            sen_pre = []
            for n in range(max_caption_length):
                word = sen_input[n]
                if word != '</S>':
                    sen_pre.append(word)
                else:
                    break

            str_input = ' '.join(sen_pre)
            image_id = image_id_list[i][j][0]
            # print(image_id)
            res[image_id] = [str_input]
            gts[image_id] = captions_gt_dict[str(image_id)]

    if not is_training:
        # for key in gts.keys():
        #     str_input = res[key]
        #     str_grundtruth = gts[key]
        #     print(key)
        #     print(str_input)
        #     print(str_grundtruth)
        #     print('*' * 100)
        with open('data/result/result_res.json', 'w') as file:
            json.dump(res, file)
        with open('data/result/result_gts.json', 'w') as file:
            json.dump(gts, file)
        # print('result.json get success')

    bleu_scorer = Bleu(n=4)
    bleu, _ = bleu_scorer.compute_score(gts=gts, res=res)

    rouge_scorer = Rouge()
    rouge, _ = rouge_scorer.compute_score(gts=gts, res=res)

    cider_scorer = Cider()
    cider, _ = cider_scorer.compute_score(gts=gts, res=res)

    meteor_scorer = Meteor()
    meteor, _ = meteor_scorer.compute_score(gts=gts, res=res)

    for i in range(4):
        bleu[i] = round(bleu[i], 4)

    return bleu, round(meteor, 4), round(rouge, 4), round(cider, 4)

def main():
    bleu_test = Bleu()
    vocab, reverse_vocab = utils.load_dict(conf.dictionary_path)

    pkl_path = os.path.join(conf.val_data_path, 'val_caption.pkl')
    with open(pkl_path, 'rb') as f:
        caption_data = pickle.load(f)
    image_id_list = caption_data.keys()
    image_to_show = set(random.sample(image_id_list, 10))

    with tf.device('/cpu:0'):
        val_dataset = get_data.batch_val_data('val', conf.batch_size, 6, conf.val_data_path)
        val_id_batch, val_image_batch = get_data.make_val_iterator(val_dataset)
    logging.info("The input graph defined!")

    with tf.variable_scope(tf.get_variable_scope()) as scope:
        train_model = ShowAttendTell(first_time=False,
                                     start_token_index=vocab[conf.start_token],
                                     pad_token_index=vocab[conf.pad_token],
                                     max_timestep=conf.sentence_length)
        caption_generator = InferenceWrapper(train_model,
                                             vocab[conf.start_token],
                                             vocab[conf.end_token],
                                             beam_size=3)
        caption_generator.build_inference_model()

    # saver = tf.train.Saver()
    result = {}
    counter = 0
    with tf.Session() as sess:
        newest_checkpoint = tf.train.latest_checkpoint(conf.ckpt_upper_path)
        utils.restore(sess, newest_checkpoint)
        while True:
            counter += 1
            logging.info("Batch %d " % counter)
            try:
                val_id_batch_data, val_image_batch_data = sess.run([val_id_batch, val_image_batch])
            except tf.errors.OutOfRangeError:
                break

            for index, image_id in enumerate(val_id_batch_data):
                caption = caption_generator.run_inference(sess, val_image_batch_data[index])
                if len(caption) == 0:
                    sentence = ""
                else:
                    sentence = utils.get_sentence(caption[0][0], reverse_vocab)
                result[int(image_id)] = [sentence]
                if image_id in image_to_show:
                    scipy.misc.imsave(str(image_id) + ".png", val_image_batch_data[index])
                    logging.info("%d : %s" % (image_id, sentence))

        score, _ = bleu_test.compute_score(caption_data, result)
        logging.info("Bleu1 %f " % (score[0]))
        logging.info("Bleu2 %f " % (score[1]))
        logging.info("Bleu3 %f " % (score[2]))
        logging.info("Bleu4 %f " % (score[3]))

# (fragment: continues a per-example ROUGE loop; i, sys_strs, wtd, rouge_obj,
# rouges, system and the other scorer objects are defined earlier in the script)
wrd = {i: [sys_strs[i]]}
rouge, _ = rouge_obj.compute_score(wtd, wrd)
rouges.append(rouge)

print(np.mean(rouges))
with open("%s-rouges.txt" % system, 'w') as outf:
    for r in rouges:
        outf.write(str(r) + '\n')

for i in range(len(ref1_strs)):
    word_target_dict[i] = [ref1_strs[i], ref2_strs[i]]
    word_response_dict[i] = [sys_strs[i]]

bleu_score, bleu_scores = bleu_obj.compute_score(word_target_dict, word_response_dict)
bleu1_score, _, _, bleu4_score = bleu_score
bleu1_scores, _, _, bleu4_scores = bleu_scores
meteor_score, meteor_scores = meteor_obj.compute_score(word_target_dict, word_response_dict)
rouge_score, rouge_scores = rouge_obj.compute_score(word_target_dict, word_response_dict)
cider_score, cider_scores = cider_obj.compute_score(word_target_dict, word_response_dict)

print("ROUGE-L: ", rouge_score)
print("BLEU-1: ", bleu1_score)
print("BLEU-4: ", bleu4_score)
print("METEOR: ", meteor_score)
print("CIDEr: ", cider_score)

def evaluate(self, experiment_path: str, feature_file: str, feature_scp: str, caption_file: str, caption_output: str = "eval_output.json", score_output: str = "scores.txt", **kwargs): """kwargs: {'max_length': int, 'method': str, 'beam_size': int}""" dump = torch.load(os.path.join(experiment_path, "saved.pth"), map_location="cpu") # Load previous training config config = dump["config"] vocabulary = torch.load(config["vocab_file"]) model = self._get_model(config, vocabulary) model.load_state_dict(dump["model"]) # Some scaler (sklearn standardscaler) scaler = dump["scaler"] zh = config["zh"] model = model.to(self.device) dataset = SJTUDatasetEval(feature=feature_file, eval_scp=feature_scp, transform=scaler.transform) dataloader = torch.utils.data.DataLoader(dataset, shuffle=False, collate_fn=collate_fn((1, )), batch_size=32, num_workers=0) caption_df = pd.read_json(caption_file, dtype={"key": str}) if zh: key2refs = caption_df.groupby("key")["tokens"].apply( list).to_dict() else: key2refs = caption_df.groupby("key")["caption"].apply( list).to_dict() model.eval() key2pred = {} def _sample(engine, batch): with torch.no_grad(): model.eval() keys = batch[0] output = self._forward(model, batch, mode="sample", **kwargs) seqs = output["seqs"].cpu().numpy() for idx, seq in enumerate(seqs): caption = self._convert_idx2sentence(seq, vocabulary, zh) key2pred[keys[idx]] = [ caption, ] pbar = ProgressBar(persist=False, ascii=True) sampler = Engine(_sample) pbar.attach(sampler) sampler.run(dataloader) pred_df = [] for key, pred in key2pred.items(): pred_df.append({ "filename": key + ".wav", "caption": "".join(pred[0]) if zh else pred[0], "tokens": pred[0] if zh else pred[0].split() }) pred_df = pd.DataFrame(pred_df) pred_df.to_json(os.path.join(experiment_path, caption_output)) from pycocoevalcap.bleu.bleu import Bleu from pycocoevalcap.rouge.rouge import Rouge from pycocoevalcap.cider.cider import Cider from pycocoevalcap.meteor.meteor import Meteor from pycocoevalcap.spice.spice import Spice f = open(os.path.join(experiment_path, score_output), "w") scorer = Bleu(n=4, zh=zh) score, scores = scorer.compute_score(key2refs, key2pred) for n in range(4): f.write("Bleu-{}: {:6.3f}\n".format(n + 1, score[n])) scorer = Rouge(zh=zh) score, scores = scorer.compute_score(key2refs, key2pred) f.write("ROUGE: {:6.3f}\n".format(score)) scorer = Cider(zh=zh) score, scores = scorer.compute_score(key2refs, key2pred) f.write("CIDEr: {:6.3f}\n".format(score)) if not zh: scorer = Meteor() score, scores = scorer.compute_score(key2refs, key2pred) f.write("Meteor: {:6.3f}\n".format(score)) scorer = Spice() score, scores = scorer.compute_score(key2refs, key2pred) f.write("Spice: {:6.3f}\n".format(score)) f.close()
def evaluate(beam_size): """ Evaluation :param beam_size: beam size at which to generate captions for evaluation :return: BLEU-4 score """ # DataLoader loader = torch.utils.data.DataLoader(CaptionDataset( data_folder, data_name, 'TEST', transform=transforms.Compose([normalize])), batch_size=1, shuffle=True, num_workers=0, pin_memory=False) # TODO: Batched Beam Search # Therefore, do not use a batch_size greater than 1 - IMPORTANT! # Lists to store references (true captions), and hypothesis (prediction) for each image # If for n images, we have n hypotheses, and references a, b, c... for each image, we need - # references = [[ref1a, ref1b, ref1c], [ref2a, ref2b], ...], hypotheses = [hyp1, hyp2, ...] references = dict() hypotheses = dict() # For each image for j, (image, caps, caplens, allcaps) in enumerate( tqdm(loader, desc="EVALUATING AT BEAM SIZE " + str(beam_size))): k = beam_size # Move to GPU device, if available image = image.to(device) # (1, 3, 256, 256) attrs, encoder_out = encoder(image) attrs = attrs.expand(3, attrs_dim) enc_image_size = encoder_out.size(1) encoder_dim = encoder_out.size(3) encoder_out = encoder_out.view(1, -1, encoder_dim) num_pixels = encoder_out.size(1) encoder_out = encoder_out.expand(k, num_pixels, encoder_dim) x0 = decoder.init_x0(attrs) # Tensor to store top k previous words at each step; now they're just <start> k_prev_words = torch.LongTensor([[word_map['<start>']]] * k).to( device) # (k, 1) # Tensor to store top k sequences; now they're just <start> seqs = k_prev_words # (k, 1) # Tensor to store top k sequences' scores; now they're just 0 top_k_scores = torch.zeros(k, 1).to(device) # (k, 1) # Lists to store completed sequences and scores complete_seqs = list() complete_seqs_scores = list() # Start decoding step = 1 h1, c1, h2, c2 = decoder.init_hidden_state(attrs, encoder_out, zero=True) h1, c1 = decoder.decode_step1(x0, (h1, c1)) # s is a number less than or equal to k, because sequences are removed from this process once they hit <end> while True: embeddings = decoder.embedding(k_prev_words).squeeze( 1) # (s, embed_dim) h1, c1 = decoder.decode_step1(embeddings, (h1, c1)) awe, _ = decoder.attention(encoder_out, h1, h2) # gate = decoder.sigmoid(decoder.f_beta(h2)) # awe = gate * awe h2, c2 = decoder.decode_step2(torch.cat([embeddings, awe], dim=1), (h2, c2)) scores = decoder.fc2(decoder.dropout2(h2)) scores = F.log_softmax(scores, dim=1) # Add scores = top_k_scores.expand_as(scores) + scores # (s, vocab_size) # For the first step, all k points will have the same scores (since same k previous words, h, c) if step == 1: top_k_scores, top_k_words = scores[0].topk(k, 0, True, True) # (s) else: # Unroll and find top scores, and their unrolled indices # (s) 所有分数中最大的k个 top_k_scores, top_k_words = scores.view(-1).topk( k, 0, True, True) # Convert unrolled indices to actual indices of scores # 上面展开了,prev_word_inds得到哪些句子是概率最大的 prev_word_inds = top_k_words / vocab_size # (s) next_word_inds = top_k_words % vocab_size # (s) # Add new words to sequences seqs = torch.cat( [seqs[prev_word_inds], next_word_inds.unsqueeze(1)], dim=1) # (s, step+1) # Which sequences are incomplete (didn't reach <end>)? 
incomplete_inds = [ ind for ind, next_word in enumerate(next_word_inds) if next_word != word_map['<end>'] ] complete_inds = list( set(range(len(next_word_inds))) - set(incomplete_inds)) # Set aside complete sequences if len(complete_inds) > 0: complete_seqs.extend(seqs[complete_inds].tolist()) complete_seqs_scores.extend(top_k_scores[complete_inds]) k -= len(complete_inds) # reduce beam length accordingly # Proceed with incomplete sequences if k == 0: break seqs = seqs[incomplete_inds] h1 = h1[prev_word_inds[incomplete_inds]] c1 = c1[prev_word_inds[incomplete_inds]] h2 = h2[prev_word_inds[incomplete_inds]] c2 = c2[prev_word_inds[incomplete_inds]] encoder_out = encoder_out[prev_word_inds[incomplete_inds]] top_k_scores = top_k_scores[incomplete_inds].unsqueeze(1) k_prev_words = next_word_inds[incomplete_inds].unsqueeze(1) # Break if things have been going on too long if step > 50: break step += 1 i = complete_seqs_scores.index(max(complete_seqs_scores)) seq = complete_seqs[i] # References img_caps = allcaps[0].tolist() img_captions = list( map( lambda c: [ rev_word_map[w] for w in c if w not in { word_map['<start>'], word_map['<end>'], word_map[ '<pad>'] } ], img_caps)) # remove <start> and pads img_caps = [' '.join(c) for c in img_captions] # print(img_caps) references[str(j)] = img_caps # Hypotheses hypothesis = ([ rev_word_map[w] for w in seq if w not in {word_map['<start>'], word_map['<end>'], word_map['<pad>']} ]) hypothesis = [' '.join(hypothesis)] # print(hypothesis) hypotheses[str(j)] = hypothesis assert len(references) == len(hypotheses) # Calculate BLEU-1~BLEU4 scores m1 = Bleu() m2 = Meteor() m3 = Cider() m4 = Rouge() m5 = Spice() (score1, scores1) = m1.compute_score(references, hypotheses) (score2, scores2) = m2.compute_score(references, hypotheses) (score3, scores3) = m3.compute_score(references, hypotheses) (score4, scores4) = m4.compute_score(references, hypotheses) (score5, scores5) = m5.compute_score(references, hypotheses) return score1, score2, score3, score4, score5
def run_load_gap_filler(pretrained_filename, do_bleu=False, must_have_anp=False, copy_if_no_anp=False, replace_adj=False, get_human=False, semi_human=False): rnn = RNNModel() rnn.load_model(pretrained_filename) rnn.conf['VAL_SPLIT'] = RNNDataProvider.TEST if get_human: id_to_caps = pickle.load(open("coco_mturk/id_to_caps.pik", "rb")) rnn.build_model_core() rnn.load_val_dataset() rnn.build_sentence_generator() rnn.build_perplexity_calculator() #print rnn.sample_sentence(rnn.V_valid[0]) #print decoder_beamsearch2(rnn, rnn.V_valid[0]) #print decoder_beamsearch(rnn, rnn.V_valid[0]) #calculate_metric(rnn) #sys.exit(0) pos_sentence_res = [] pos_att_res = [] des_sentence_res = [] des_att_res = [] img_files = [] img_ids = [] id_to_sentences = {} seen_ids = set() if 'added_words' in rnn.conf: new_words = set([w[0] for w in rnn.conf['added_words']]) else: new_words = set() num_ignore = 0 num_not_ignore = 0 for idx in range(rnn.V_valid.shape[0]): img_file = rnn.dp.img_id_to_filename[rnn.Id_valid[idx]] img_id = rnn.Id_valid[idx] if img_id not in id_to_sentences: id_to_sentences[img_id] = [] #id_to_sentences[img_id].append(' '.join([rnn.dp.i2w[w] for w in rnn.X_valid[idx] if w != 0][::-1])) if replace_adj: id_to_sentences[img_id] = [ ' '.join(do_replace_adj(rnn.dp.tokens[i])[::-1]) for i in rnn.dp.img_id_to_tokens[img_id] ] elif get_human: id_to_sentences[img_id] = [ ' '.join(rnn.dp.tokens[i][::-1]) for i in rnn.dp.img_id_to_tokens[img_id] ] np.random.shuffle(id_to_sentences[img_id]) print(len(id_to_sentences[img_id])) human_sen_pos = id_to_sentences[img_id].pop() print(len(id_to_sentences[img_id])) if not id_to_sentences[img_id]: continue else: id_to_sentences[img_id] = [ ' '.join(rnn.dp.tokens[i][::-1]) for i in rnn.dp.img_id_to_tokens[img_id] ] #print id_to_sentences[img_id] if img_id in seen_ids: continue seen_ids.add(img_id) if get_human and not semi_human: pos_sen = human_sen_pos.split()[::-1] np.random.shuffle(id_to_caps[img_id]) des_sen = id_to_caps[img_id][0][::-1] else: lp, pos_sen, pos_att = decoder_beamsearch_with_attention( rnn, rnn.V_valid[idx], senti=1.0, beam_size=5) lp, des_sen, des_att = decoder_beamsearch_with_attention( rnn, rnn.V_valid[idx], senti=-1.0, beam_size=5) pos_sen = pos_sen[:-1] des_sen = des_sen[:-1] #des_att = des_att[:-1] pos_att = pos_att[:-1] #pos_sen, pos_att = rnn.get_sentence(rnn.V_valid[idx], senti=np.array([1.0], dtype=theano.config.floatX)) pos_att = np.array(pos_att) pos_att = pos_att.flatten() #des_att = np.array(des_att) #des_att = des_att.flatten() des_att = np.zeros((len(des_sen), )) #pos_att = np.zeros((len(pos_sen),)) if must_have_anp: if not sentence_has_anp(pos_sen[::-1]): num_ignore += 1 continue num_not_ignore += 1 if copy_if_no_anp: if not sentence_has_anp(pos_sen[::-1]): pos_sen = des_sen if replace_adj: pos_sen = do_replace_adj(pos_sen[::-1])[::-1] des_sen = do_replace_adj(des_sen[::-1])[::-1] #des_sen, des_att = rnn.get_sentence(rnn.V_valid[idx], senti=np.array([-1.0], dtype=theano.config.floatX)) new_pos_sen = [] for vv, a in zip(pos_sen, pos_att): out = vv col = "" if a > 0.75: col = "#FF3300" elif a > 0.5: col = "#FF5C33" elif a > 0.25: col = "#FF8566" #if a > 0.75: # col = "#33CC33"# "#3366FF" #elif a > 0.5: # col = "#70DB70" #"#5C85FF" #elif a > 0.25: # col = "#ADEBAD" #"#85A3FF" if col: out = "<font style='background-color: %s'>%s</font>" % (col, vv) new_pos_sen.append(out) pos_sen = new_pos_sen print(pos_sen) print(pos_att) print(des_sen) print_it = False for v in pos_sen: if v in new_words: print_it = True if print_it: for x in 
zip(pos_sen, pos_att)[::-1]: print(x[0], end=' ') print("") #for x in zip(pos_sen, pos_att)[::-1]: # print x[0], #print "" #for x in zip(des_sen, des_att)[::-1]: # print x[0], #print "\n" pos_att = pos_att[:len(pos_sen)] des_att = des_att[:len(des_sen)] pos_sentence_res.append(pos_sen[::-1]) pos_att_res.append(np.exp(pos_att[::-1])) des_sentence_res.append(des_sen[::-1]) des_att_res.append(np.exp(des_att[::-1])) img_files.append(img_file) img_ids.append(img_id) output = { 'pos_sen': pos_sentence_res, 'pos_att': pos_att_res, 'des_sen': des_sentence_res, 'des_att': des_att_res, 'img_files': img_files, 'img_ids': img_ids } pickle.dump(output, open("output_data/sen_att_pos_01.pik", "wb"), protocol=2) if must_have_anp: print("Must have ANP % removed:", num_ignore / float(num_not_ignore) * 100.0) print("getting Positive perplexity") print(rnn.get_val_perplexity()) print("got perplexity") print("getting Descriptive perplexity") print(rnn.get_val_perplexity(base=True)) print("got perplexity") gts = {} res = {} fout = open("eval/output_pos", "w") for line, iid in zip(pos_sentence_res, img_ids): fout.write(' '.join(line) + '\n') if iid not in res: res[iid] = [] res[iid].append(' '.join(line)) fout.close() res_des = {} fout = open("eval/output_des", "w") for line, iid in zip(des_sentence_res, img_ids): fout.write(' '.join(line) + '\n') if iid not in res_des: res_des[iid] = [] res_des[iid].append(' '.join(line)) fout.close() for i in range(3): fout = open("eval/reference%d" % i, "w") for cid in img_ids: if cid not in gts: gts[cid] = [] if len(id_to_sentences[cid]) > i: gts[cid].append(id_to_sentences[cid][i]) fout.write(id_to_sentences[cid][i] + "\n") else: fout.write("\n") fout.close() bleu = Bleu() #for i in gts.keys()[:10]: # print gts[i] # print res_des[i] # print res[i] # print "" total_ref_sentences = 0 for i in list(gts.keys()): total_ref_sentences += len(gts[i]) print("Total ref sentences:", total_ref_sentences) print("Bleu:") print("Positive:", bleu.compute_score(gts, res)[0]) print("Descriptive:", bleu.compute_score(gts, res_des)[0]) rouge = Rouge() print("Rouge:") print("Positive:", rouge.compute_score(gts, res)[0]) print("Descriptive:", rouge.compute_score(gts, res_des)[0]) cider = Cider() print("Cider:") print("Positive:", cider.compute_score(gts, res)[0]) print("Descriptive:", cider.compute_score(gts, res_des)[0]) meteor = Meteor() print("Meteor:") print("Positive:", meteor.compute_score(gts, res)[0]) print("Descriptive:", meteor.compute_score(gts, res_des)[0])