def evaluate_answer(self, session, dataset, sample=100, log=False):
    f1 = 0.
    em = 0.
    num_iter = 0
    flag = False
    for q_id, p_id, a_span, paragraph in load_dataset(dataset):
        q_batch, m_q_batch = zip(
            *pad_sequences(q_id, FLAGS.question_output_size))
        p_batch, m_p_batch = zip(
            *pad_sequences(p_id, FLAGS.paragraph_output_size))
        a_s, a_e = self.answer(session, q_batch, p_batch, m_q_batch,
                               m_p_batch)
        for i in range(len(a_span)):
            answer = paragraph[i][a_s[i]:a_e[i] + 1]
            true_answer = paragraph[i][a_span[i][0]:a_span[i][1] + 1]
            f1 += f1_score(answer, true_answer)
            em += exact_match_score(answer, true_answer)
            num_iter += 1
            if num_iter >= sample:
                flag = True
                break
        if flag:
            break
    f1 = f1 / sample
    em = em / sample
    if log:
        logging.info("F1: {}, EM: {}, for {} samples".format(f1, em, sample))
    return f1, em
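# All of the evaluators in this file call f1_score and exact_match_score.
# The sketch below shows the metric helpers they assume, mirroring the
# official SQuAD v1.1 evaluation script (lower-case, strip punctuation and
# articles, then token-level F1 / normalized exact match). Treat it as a
# reference sketch, not necessarily the exact module these snippets import.
import re
import string
from collections import Counter


def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))


def f1_score(prediction, ground_truth):
    """Token-level F1 between a predicted and a gold answer string."""
    prediction_tokens = normalize_answer(prediction).split()
    ground_truth_tokens = normalize_answer(ground_truth).split()
    common = Counter(prediction_tokens) & Counter(ground_truth_tokens)
    num_same = sum(common.values())
    if num_same == 0:
        return 0.
    precision = 1.0 * num_same / len(prediction_tokens)
    recall = 1.0 * num_same / len(ground_truth_tokens)
    return (2 * precision * recall) / (precision + recall)


def exact_match_score(prediction, ground_truth):
    """True iff the normalized prediction and ground truth match exactly."""
    return normalize_answer(prediction) == normalize_answer(ground_truth)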
def evaluate_answer(self, session, dataset, sample=100, log=False):
    """
    Evaluate the model's performance using the harmonic mean of F1 and
    Exact Match (EM) with the set of true answer labels

    This step actually takes quite some time. So we can only sample
    100 examples from either training or testing set.

    :param session: session should always be centrally managed in train.py
    :param dataset: a representation of our data, in some implementations,
        you can pass in multiple components (arguments) of one dataset
        to this function
    :param sample: how many examples in dataset we look at
    :param log: whether we print to std out stream
    :return:
    """
    batch = dataset
    if sample is None:
        sample = len(dataset)
    else:
        # If we only select a subset of the data
        # (randint's upper bound is inclusive, so stay within range)
        random_indices = [
            random.randint(0, len(dataset) - 1) for _ in range(sample)
        ]
        batch = [dataset[idx] for idx in random_indices]
    question_batch, context_batch, question_mask_batch, context_mask_batch, \
        start_answer_batch, end_answer_batch = zip(*batch)
    # These are both arrays of length sample size
    a_s, a_e = self.answer(session, batch)
    true_a_s = np.argmax(start_answer_batch, axis=1)
    true_a_e = np.argmax(end_answer_batch, axis=1)
    print("predicted a_s: ", a_s)
    print("predicted a_e: ", a_e)
    print("true start answer: ", true_a_s)
    print("true end answer: ", true_a_e)
    answers = [
        context_batch[i][a_s[i]:a_e[i] + 1] for i in range(len(a_s))
    ]
    true_answers = [
        context_batch[i][true_a_s[i]:true_a_e[i] + 1]
        for i in range(len(true_a_s))
    ]
    f1s = []
    ems = []
    for i in range(len(true_answers)):
        answer = answers[i]
        true_answer = true_answers[i]
        f1s.append(f1_score(answer, true_answer))
        ems.append(exact_match_score(answer, true_answer))
    f1 = np.sum(f1s) / float(sample)
    em = np.sum(ems) / float(sample)
    if log:
        logging.info("F1: {}, EM: {}, for {} samples".format(f1, em, sample))
    return f1s, ems
def evaluate_answer(self, session, dataset, save=False):
    # Evaluate the model's performance using the harmonic mean of F1 and
    # Exact Match (EM) with the set of true answer labels.
    res = []
    prob = []
    for j in range(0, len(dataset)):
        sample = create_minibatch(dataset, 1, j)
        s, e, p1, p2 = self.answer(session, sample)
        _, p, a, _, _ = sample
        idx = list(p[0])
        res.append((idx[s[0]:e[0] + 1], idx[a[0][0]:a[0][1] + 1]))
        # save prediction probabilities for future use
        if save:
            prob.append((p1.tolist(), p2.tolist(), p.tolist(), a.tolist()))
    f1 = 0.
    em = 0.
    for p, g in res:
        text_p = " ".join(str(i) for i in p)
        text_g = " ".join(str(i) for i in g)
        f1 += f1_score(text_p, text_g)
        em += exact_match_score(text_p, text_g)
    return f1 / len(dataset), em / len(dataset), prob
def get_spans(self, session, context_path, qn_path, ans_path, dataset,
              num_samples=0):
    """
    Sample from the provided (train/dev) set.

    Inputs:
      session: TensorFlow session
      qn_path, context_path, ans_path: paths to
        {dev/train}.{question/context/answer} data files.
      dataset: string. Either "train" or "dev". Just for logging purposes.

    Returns:
      begin_prob, end_prob: the average probabilities of the sampled examples.
    """
    total_start_dists = []
    total_end_dists = []
    f1_em_scores = []
    example_num = 0
    for batch in get_batch_generator(
            self.word2id, context_path, qn_path, ans_path,
            self.FLAGS.batch_size, context_len=self.FLAGS.context_len,
            question_len=self.FLAGS.question_len, discard_long=False,
            random=False):
        pred_start_dists, pred_end_dists = self.get_prob_dists(session, batch)
        pred_start_pos, pred_end_pos = self.get_start_end_pos(session, batch)
        # Convert the positions and distributions to lists of length batch_size
        pred_start_pos = pred_start_pos.tolist()
        pred_end_pos = pred_end_pos.tolist()
        pred_start_dists = pred_start_dists.tolist()
        pred_end_dists = pred_end_dists.tolist()
        for ex_idx, (pred_ans_start, pred_ans_end, true_ans_tokens) in enumerate(
                zip(pred_start_pos, pred_end_pos, batch.ans_tokens)):
            example_num += 1
            # Get the predicted answer
            # Important: batch.context_tokens contains the original words (no UNKs)
            # You need to use the original no-UNK version when measuring F1/EM
            pred_ans_tokens = batch.context_tokens[ex_idx][
                pred_ans_start:pred_ans_end + 1]
            pred_answer = " ".join(pred_ans_tokens)
            # Get true answer (no UNKs)
            true_answer = " ".join(true_ans_tokens)
            # Calc F1/EM
            f1 = f1_score(pred_answer, true_answer)
            em = exact_match_score(pred_answer, true_answer)
            f1_em_scores.append((f1, em))
            # print_example(self.word2id, batch.context_tokens[ex_idx],
            #               batch.qn_tokens[ex_idx], batch.ans_span[ex_idx, 0],
            #               batch.ans_span[ex_idx, 1], pred_ans_start,
            #               pred_ans_end, true_answer, pred_answer, f1, em)
            if num_samples != 0 and example_num >= num_samples:
                break
        total_end_dists += pred_end_dists
        total_start_dists += pred_start_dists
        if num_samples != 0 and example_num >= num_samples:
            break
    return (np.asarray(total_start_dists), np.asarray(total_end_dists),
            np.asarray(f1_em_scores))
def evaluate_answer(self, session, dataset, vocab, sample=400, log=False):
    f1 = 0.
    em = 0.
    N = len(dataset)
    sampleIndices = np.random.choice(N, sample, replace=False)
    evaluate_set = [dataset[i] for i in sampleIndices]
    predicts = self.predict_on_batch(session, evaluate_set)
    for example, (start, end) in zip(evaluate_set, predicts):
        q, _, c, _, (true_s, true_e) = example
        # print(start, end, true_s, true_e)
        context_words = [vocab[w] for w in c]
        true_answer = ' '.join(context_words[true_s:true_e + 1])
        if start <= end:
            predict_answer = ' '.join(context_words[start:end + 1])
        else:
            predict_answer = ''
        f1 += f1_score(predict_answer, true_answer)
        em += exact_match_score(predict_answer, true_answer)
    f1 = 100 * f1 / sample
    em = 100 * em / sample
    if log:
        logging.info("F1: {}, EM: {}, for {} samples".format(f1, em, sample))
    return f1, em
def get_eval(self, sess, dataset, batch_size, sample=True):
    '''if sample, take first batch only'''
    f1 = em = total = 0
    for batch in get_minibatches(dataset, batch_size, shuffle=True):
        p, q, p_len, q_len, a_s, a_e, p_raw = zip(*batch)
        loss, norm, ys, ye = self.eval_batch(sess, p, q, p_len, q_len, a_s,
                                             a_e)
        a_s_pred = np.argmax(ys, axis=1)
        a_e_pred = np.argmax(ye, axis=1)
        for i in range(len(batch)):
            # predicted a_s and a_e
            s_pred = a_s_pred[i]
            e_pred = a_e_pred[i]
            # ground truth labels
            a_raw = ' '.join(p_raw[i][a_s[i]:a_e[i] + 1])
            pred_raw = ' '.join(p_raw[i][s_pred:e_pred + 1])
            f1 += f1_score(pred_raw, a_raw)
            em += exact_match_score(pred_raw, a_raw)
            total += 1
        if sample:
            break
    em = 100.0 * em / total
    f1 = 100.0 * f1 / total
    return (f1, em, loss, norm)
def evaluate_answer(self, session, dataset, sample=100, log=False,
                    datatype='val'):
    """
    Evaluate the model's performance using the harmonic mean of F1 and
    Exact Match (EM) with the set of true answer labels

    This step actually takes quite some time. So we can only sample
    100 examples from either training or testing set.

    :param session: session should always be centrally managed in train.py
    :param dataset: a representation of our data, in some implementations,
        you can pass in multiple components (arguments) of one dataset
        to this function
    :param sample: how many examples in dataset we look at
    :param log: whether we print to std out stream
    :return:
    """
    f1 = 0.
    em = 0.
    fname = "../.."
    with open(os.path.join(self.config.flag.data_dir,
                           "%s.context" % datatype)) as f:
        data_paragraph = [line.split() for line in f.read().splitlines()]
    with open(os.path.join(self.config.flag.data_dir,
                           "%s.answer" % datatype)) as f:
        data_answer = [line.split() for line in f.read().splitlines()]
    ground_truth = (data_paragraph, data_answer)
    i = 0
    while i < sample:
        preds = self.answer(
            session,
            (dataset[datatype][0][i:i + self.config.flag.batch_size],
             dataset[datatype][1][i:i + self.config.flag.batch_size],
             dataset[datatype][2][i:i + self.config.flag.batch_size]))
        for j in range(len(preds[0])):
            prediction = ' '.join(
                ground_truth[0][i][preds[0][j]:(preds[1][j] + 1)])
            gt = ' '.join(ground_truth[1][i])
            f1_instance = f1_score(prediction, gt)
            em_instance = exact_match_score(prediction, gt)
            em = em + em_instance
            f1 = f1 + f1_instance
            i += 1
    em = 100 * em / float(sample)
    f1 = 100 * f1 / float(sample)
    if log:
        logging.info(
            "Output for '{}' dataset - F1: {}, EM: {}, for {} samples".format(
                datatype, f1, em, sample))
    return f1, em
def check_f1_em(self, context_path, qn_path, ans_path, dataset,
                num_samples=1000):
    f1_total = 0.
    em_total = 0.
    example_num = 0
    for batch in data_batcher.get_batch_generator(
            self.word2id, self.id2idf, context_path, qn_path, ans_path,
            self.batch_size, context_len=300, question_len=30,
            discard_long=False):
        pred_start_pos, pred_end_pos = self.get_predictions(batch)
        # Convert the start and end positions to lists length batch_size
        pred_start_pos = pred_start_pos.tolist()  # list length batch_size
        pred_end_pos = pred_end_pos.tolist()  # list length batch_size
        for ex_idx, (pred_ans_start, pred_ans_end, true_ans_tokens) in \
                enumerate(zip(pred_start_pos, pred_end_pos, batch.ans_tokens)):
            example_num += 1
            # Get the predicted answer
            # Important: batch.context_tokens contains the original words (no UNKs)
            # You need to use the original no-UNK version when measuring F1/EM
            pred_ans_tokens = batch.context_tokens[ex_idx][
                pred_ans_start:pred_ans_end + 1]
            pred_answer = " ".join(pred_ans_tokens)
            # Get true answer (no UNKs)
            true_answer = " ".join(true_ans_tokens)
            # Calc F1/EM
            f1 = f1_score(pred_answer, true_answer)
            em = exact_match_score(pred_answer, true_answer)
            f1_total += f1
            em_total += em
            if num_samples != 0 and example_num >= num_samples:
                break
        if num_samples != 0 and example_num >= num_samples:
            break
    f1_total /= example_num
    em_total /= example_num
    return f1_total, em_total
def check_f1_em(self, model, dataset, num_samples=100, print_to_screen=False):
    logging.info("Calculating F1/EM for %s examples in %s set..." %
                 (str(num_samples) if num_samples != 0 else "all", dataset))
    if dataset == "train":
        context_path, qn_path, ans_path = \
            self.train_context_path, self.train_qn_path, self.train_ans_path
    elif dataset == "dev":
        context_path, qn_path, ans_path = \
            self.dev_context_path, self.dev_qn_path, self.dev_ans_path
    else:
        raise ValueError('dataset is not defined')
    f1_total = 0.
    em_total = 0.
    example_num = 0
    tic = time.time()
    for batch in get_batch_generator(self.word2id, context_path, qn_path,
                                     ans_path, config.batch_size,
                                     context_len=config.context_len,
                                     question_len=config.question_len,
                                     discard_long=False):
        pred_start_pos, pred_end_pos = self.test_one_batch(batch, model)
        pred_start_pos = pred_start_pos.tolist()
        pred_end_pos = pred_end_pos.tolist()
        for ex_idx, (pred_ans_start, pred_ans_end, true_ans_tokens) \
                in enumerate(zip(pred_start_pos, pred_end_pos, batch.ans_tokens)):
            example_num += 1
            pred_ans_tokens = batch.context_tokens[ex_idx][
                pred_ans_start:pred_ans_end + 1]
            pred_answer = " ".join(pred_ans_tokens)
            true_answer = " ".join(true_ans_tokens)
            f1 = f1_score(pred_answer, true_answer)
            em = exact_match_score(pred_answer, true_answer)
            f1_total += f1
            em_total += em
            if print_to_screen:
                print_example(self.word2id, batch.context_tokens[ex_idx],
                              batch.qn_tokens[ex_idx], batch.ans_span[ex_idx, 0],
                              batch.ans_span[ex_idx, 1], pred_ans_start,
                              pred_ans_end, true_answer, pred_answer, f1, em)
            if num_samples != 0 and example_num >= num_samples:
                break
        if num_samples != 0 and example_num >= num_samples:
            break
    f1_total /= example_num
    em_total /= example_num
    toc = time.time()
    logging.info("Calculating F1/EM for %i examples in %s set took %.2f seconds"
                 % (example_num, dataset, toc - tic))
    return f1_total, em_total
def argmax_eval(data):
    em = 0.
    f1 = 0.
    for start, end, paragraph, answer in data:
        text_p = " ".join(paragraph[np.argmax(start):np.argmax(end) + 1])
        text_g = " ".join(paragraph[answer[0]:answer[1] + 1])
        f1 += f1_score(text_p, text_g)
        em += exact_match_score(text_p, text_g)
    print("argmax EM: {:.5f}, F1: {:.5f}".format(em / len(data),
                                                 f1 / len(data)))
def eval_sentence(self, preds_ind, gold_ind, sentence):
    pred_vecs = [s for s, p in zip(sentence, preds_ind) if p]
    gold_vecs = [s for s, g in zip(sentence, gold_ind) if g]
    pred_sent = ' '.join(self.vocab[i] for i in pred_vecs)
    gold_sent = ' '.join(self.vocab[i] for i in gold_vecs)
    f1 = new_f1_score(pred_sent, gold_sent)
    em = exact_match_score(pred_sent, gold_sent)
    return f1, em, pred_sent, gold_sent
def evaluate_answer(self, session, dataset, context, sample=100, log=False,
                    eval_set='train'):
    """
    Evaluate the model's performance using the harmonic mean of F1 and
    Exact Match (EM) with the set of true answer labels

    This step actually takes quite some time. So we can only sample
    100 examples from either training or testing set.

    :param session: session should always be centrally managed in train.py
    :param dataset: a representation of our data, in some implementations,
        you can pass in multiple components (arguments) of one dataset
        to this function
    :param sample: how many examples in dataset we look at
    :param log: whether we print to std out stream
    :return:
    """
    if sample is None:
        sampled = dataset
        sample = len(dataset)
    else:
        # np.random.seed(0)
        sampled = dataset[np.random.choice(dataset.shape[0], sample)]
    a_s, a_e = self.answer(session, sampled)
    f1 = []
    em = []
    sampled = sampled.T
    for i in range(len(sampled[0])):
        pred_words = ' '.join(context[i][a_s[i]:a_e[i] + 1])
        actual_words = ' '.join(
            context[i][sampled[2][i][0]:sampled[2][i][1] + 1])
        f1.append(f1_score(pred_words, actual_words))
        em.append(exact_match_score(pred_words, actual_words))
    if log:
        logging.info("{},F1: {}, EM: {}, for {} samples".format(
            eval_set, np.mean(f1), np.mean(em), sample))
    return f1, em
def EM_F1(pos_scores, batch_target):
    pos = [np.argmax(x, axis=1) for x in pos_scores]
    predict_ans = normalize_ans(pos)
    ans = normalize_ans(batch_target)
    em = f1 = 0
    for prediction, ground_truth in zip(predict_ans, ans):
        em += exact_match_score(prediction, ground_truth)
        f1 += f1_score(prediction, ground_truth)
    em = 100.0 * em / len(ans)
    f1 = 100.0 * f1 / len(ans)
    return em, f1
def evaluate_answer(self, session, dataset, rev_vocab, sample=20, log=False):
    """
    Evaluate the model's performance using the harmonic mean of F1 and
    Exact Match (EM) with the set of true answer labels

    This step actually takes quite some time. So we can only sample
    100 examples from either training or testing set.

    :param session: session should always be centrally managed in train.py
    :param dataset: a representation of our data, in some implementations,
        you can pass in multiple components (arguments) of one dataset
        to this function
    :param sample: how many examples in dataset we look at
    :param log: whether we print to std out stream
    :return:
    """
    sample = min(sample, len(dataset))
    overall_f1 = 0.
    overall_em = 0.
    minibatch_size = 100
    num_batches = int(sample / minibatch_size)
    for batch in range(0, num_batches):
        start = batch * minibatch_size
        end = min(len(dataset), start + minibatch_size)
        h_s, h_e, _ = self.decode(session, dataset[start:end])
        for i in range(minibatch_size):
            a_s = np.argmax(h_s[i])
            a_e = np.argmax(h_e[i])
            if a_s > a_e:
                a_s, a_e = a_e, a_s
            sample_dataset = dataset[start + i]
            context = sample_dataset[0]
            (a_s_true, a_e_true) = sample_dataset[6]
            predicted_answer = self.formulate_answer(context, rev_vocab, a_s,
                                                     a_e)
            true_answer = self.formulate_answer(context, rev_vocab, a_s_true,
                                                a_e_true)
            f1 = f1_score(predicted_answer, true_answer)
            overall_f1 += f1
            if exact_match_score(predicted_answer, true_answer):
                overall_em += 1
    # report and return per-sample averages for both metrics
    average_f1 = overall_f1 / sample
    average_em = overall_em / sample
    logging.info("F1: {}, EM: {}, for {} samples\n".format(
        average_f1, average_em, sample))
    return average_f1, average_em
def evaluate_answer(self, session, qs, cs, sample=100, log=False):
    """
    Evaluate the model's performance using the harmonic mean of F1 and
    Exact Match (EM) with the set of true answer labels

    This step actually takes quite some time. So we can only sample
    100 examples from either training or testing set.

    :param session: session should always be centrally managed in train.py
    :param dataset: a representation of our data, in some implementations,
        you can pass in multiple components (arguments) of one dataset
        to this function
    :param sample: how many examples in dataset we look at
    :param log: whether we print to std out stream
    :return:
    """
    print("Evaluating Answers")
    f1 = 0.
    em = 0.
    text_file = open("./data/squad/train.context", "r")
    inputs_c = text_file.read().split("\n")
    context = []
    text_file.close()
    for i in range(sample):
        words = inputs_c[i].split()
        context.append(words)
    prediction = []
    # need to define self.true somewhere
    ground_truth = []
    self.a_s, self.a_e = self.answer(session, qs, cs)
    # these functions are defined in evaluate.py. They are already written
    # and should not be changed.
    # Not sure if these indices are the best way to access these
    for i in range(sample):
        prediction.append(context[i][self.a_s:self.a_e + 1])
        ground_truth.append(context[i][self.true_s:self.true_e + 1])
        f1 = f1 + f1_score(prediction[i], ground_truth[i]) / sample
        em = em + exact_match_score(prediction[i], ground_truth[i]) / sample
    if log:
        logging.info("F1: {}, EM: {}, for {} samples".format(f1, em, sample))
        # might be redundant
        print("With Print, F1: {}, EM: {}, for {} samples".format(
            f1, em, sample))
    return f1, em
def eval_text():
    '''figure out how evaluation works.'''
    with open(an_path) as f:
        raw_data = [line.strip() for line in f.readlines()]
    print(raw_data[:10])
    l = ['Corpus', 'Juris', 'canonici']
    s = ' '.join(l)
    print(s)
    print(f1_score(s, raw_data[0]))
    print(exact_match_score(s, raw_data[0]) / 1.0)
def evaluate_answer(self, session, dataset, context, sample=100, log=False,
                    eval_set='train'):
    """
    Evaluate the model's performance using the harmonic mean of F1 and
    Exact Match (EM) with the set of true answer labels

    This step actually takes quite some time. So we can only sample
    100 examples from either training or testing set.

    :param session: session should always be centrally managed in train.py
    :param dataset: a representation of our data, in some implementations,
        you can pass in multiple components (arguments) of one dataset
        to this function
    :param sample: how many examples in dataset we look at
    :param log: whether we print to std out stream
    :return:
    """
    if sample is None:
        sampled = dataset
        sample = len(dataset[0])
    else:
        # np.random.seed(0)
        inds = np.random.choice(len(dataset[0]), sample)
        sampled = [elem[inds] for elem in dataset]
        context = [context[i] for i in inds]
    a_s, a_e = self.answer(session, sampled)
    context_ids, question_ids, answer_spans, ctx_mask, q_mask = sampled
    f1 = []
    em = []
    for i in range(len(sampled[0])):
        pred_words = ' '.join(context[i][a_s[i]:a_e[i] + 1])
        actual_words = ' '.join(
            context[i][answer_spans[i][0]:answer_spans[i][1] + 1])
        f1.append(f1_score(pred_words, actual_words))
        cur_em = exact_match_score(pred_words, actual_words)
        em.append(float(cur_em))
    if log:
        logging.info("{},F1: {}, EM: {}, for {} samples".format(
            eval_set, np.mean(f1), np.mean(em), sample))
    return np.mean(f1), np.mean(em)
def check_f1_em(self, session, context_path, qn_path, ans_path, dataset,
                num_samples=100, print_to_screen=False):
    """
    Sample from the provided (train/dev) set. For each sample, calculate F1
    and EM score. Return average F1 and EM score for all samples.
    Optionally pretty-print examples.
    """
    logging.info("Calculating F1/EM for %s examples in %s set..." %
                 (str(num_samples) if num_samples != 0 else "all", dataset))
    f1_total = 0.
    em_total = 0.
    example_num = 0
    tic = time.time()
    for batch in get_batch_generator(self.word2id, context_path, qn_path,
                                     ans_path, self.FLAGS.batch_size,
                                     context_len=self.FLAGS.context_len,
                                     question_len=self.FLAGS.question_len,
                                     discard_long=False):
        pred_start_pos, pred_end_pos = self.get_start_end_pos(session, batch)
        for ex_idx, (pred_ans_start, pred_ans_end, true_ans_tokens) in enumerate(
                zip(pred_start_pos, pred_end_pos, batch.ans_tokens)):
            example_num += 1
            # Get the predicted answer
            pred_ans_tokens = batch.context_tokens[ex_idx][
                pred_ans_start:pred_ans_end + 1]
            pred_answer = " ".join(pred_ans_tokens)
            # Get true answer (no UNKs)
            true_answer = " ".join(true_ans_tokens)
            # Calc F1/EM
            f1 = f1_score(pred_answer, true_answer)
            em = exact_match_score(pred_answer, true_answer)
            f1_total += f1
            em_total += em
            # Optionally pretty-print
            if print_to_screen:
                print_example(self.word2id, batch.context_tokens[ex_idx],
                              batch.qn_tokens[ex_idx], batch.ans_span[ex_idx, 0],
                              batch.ans_span[ex_idx, 1], pred_ans_start,
                              pred_ans_end, true_answer, pred_answer, f1, em)
            if num_samples != 0 and example_num >= num_samples:
                break
        if num_samples != 0 and example_num >= num_samples:
            break
    f1_total /= example_num
    em_total /= example_num
    toc = time.time()
    logging.info("Calculating F1/EM for %i examples in %s set took %.2f seconds"
                 % (example_num, dataset, toc - tic))
    return f1_total, em_total
def evaluate_answer(self, session, dataset, rev_vocab, sample=100, log=False):
    """
    :param session: session should always be centrally managed in train.py
    :param dataset: a representation of our data, in some implementations,
        you can pass in multiple components (arguments) of one dataset
        to this function
    :param sample: how many examples in dataset we look at
    :param log: whether we print to std out stream
    :return:
    """
    our_answers = []
    their_answers = []
    eval_set = random.sample(dataset, sample)
    batches, num_batches = get_batches(eval_set, self.FLAGS.batch_size)
    for batch in batches:
        val_questions, val_question_masks, val_paragraphs, \
            val_paragraph_masks, _, val_true_answers = zip(*batch)
        a_s, a_e = self.answer(session, val_questions, val_paragraphs,
                               val_question_masks, val_paragraph_masks)
        for s, e, paragraph in zip(a_s, a_e, val_paragraphs):
            # The slice of the context paragraph that is our answer
            token_answer = paragraph[s:e + 1]
            sentence = [rev_vocab[token] for token in token_answer]
            our_answer = ' '.join(word for word in sentence)
            our_answers.append(our_answer)
        for true_answer in val_true_answers:
            their_answer = ' '.join(word for word in true_answer)
            their_answers.append(their_answer)
    assert len(our_answers) == len(their_answers)
    f1 = exact_match = total = 0
    answer_tuples = zip(their_answers, our_answers)
    for ground_truth, prediction in answer_tuples:
        total += 1
        exact_match += exact_match_score(prediction, ground_truth)
        f1 += f1_score(prediction, ground_truth)
    exact_match = 100.0 * exact_match / total
    f1 = 100.0 * f1 / total
    if log:
        logging.info("F1: {}, EM: {}, for {} samples".format(
            f1, exact_match, sample))
        logging.info("Samples:")
        for i in xrange(min(10, sample)):
            ground_truth, our_answer = answer_tuples[i]
            logging.info("Ground Truth: {}, Our Answer: {}".format(
                ground_truth, our_answer))
    return f1, exact_match
def evaluate(self, answers, gold):
    '''calculates f1 and em, given a batch of guesses and gold data'''
    num = len(answers)
    assert num == len(gold)
    f1 = 0.
    em = 0.
    for i in xrange(num):
        f1 += f1_score(answers[i], gold[i])
        emm = exact_match_score(answers[i], gold[i])
        em += emm
        # print(i, str(emm)[0], '|', answers[i], '|', gold[i])
    return (f1 / num, em / num)
def evaluate_answer(self, session, dataset_address, sample=100, log=False):
    """
    Evaluate the model's performance using the harmonic mean of F1 and
    Exact Match (EM) with the set of true answer labels

    This step actually takes quite some time. So we can only sample
    100 examples from either training or testing set.

    :param session: session should always be centrally managed in train.py
    :param dataset: a representation of our data, in some implementations,
        you can pass in multiple components (arguments) of one dataset
        to this function
    :param sample: how many examples in dataset we look at
    :param log: whether we print to std out stream
    :return:
    """
    # The f1_score and exact_match_score functions defined work only with
    # strings; need to write new ones that work with lists like below.
    f1 = 0.
    em = 0.
    dataset, num_samples = get_sample(
        dataset_address, self.FLAGS.context_paragraph_max_length, sample)
    test_questions, test_paragraphs, test_start_answers, test_end_answers = \
        dataset
    predictions = self.answer(session, test_paragraphs, test_questions)
    for i in range(num_samples):
        answer_beg = test_start_answers[i][0]  # this is a list of length 1
        answer_end = test_end_answers[i][0]  # same
        answer_str_list = [
            str(test_paragraphs[i][j])
            for j in range(answer_beg, answer_end + 1)
        ]
        true_answer = ' '.join(answer_str_list)
        prediction_str_list = [
            str(test_paragraphs[i][j])
            for j in range(predictions[i][0], predictions[i][1] + 1)
        ]
        prediction_string = ' '.join(prediction_str_list)
        f1 += f1_score(prediction_string, true_answer)
        em += exact_match_score(prediction_string, true_answer)
    f1 = 1.0 * f1 / sample
    em = 1.0 * em / sample
    if log:
        logging.info("F1: {}, EM: {}, for {} samples".format(f1, em, sample))
    return f1, em
def evaluate(args):
    opt = json.load(open('models/config.json', 'r'))['rnet']
    config = tf.ConfigProto(inter_op_parallelism_threads=1,
                            intra_op_parallelism_threads=1)
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)
    saved_model = args.model_path
    EM = 0.0
    F1 = 0.0
    with sess.as_default():
        print('Reading data')
        dp = preprocess.read_data('dev', opt)
        it, enqueue_op = dp.provide(sess)
        rnet_model = model.RNet(opt)
        loss, pt, accu = rnet_model.build_model(it)
        dequeued_p, asi, aei = it['p'], it['asi'], it['aei']
        # restore model
        print('restoring model...')
        saver = tf.train.Saver()
        saver.restore(sess, saved_model)
        # start feeding threads
        coord = tf.train.Coordinator()
        threads = []
        for i in range(opt['num_threads']):
            t = Thread(target=feeder,
                       args=(dp, sess, enqueue_op, coord, i, args.debug))
            t.start()
            threads.append(t)
        # start prediction
        print('Prediction starts')
        num_batch = int(dp.num_sample / dp.batch_size)
        for j in tqdm(range(num_batch)):
            pt_val, p_batch, asi_batch, aei_batch = sess.run(
                [pt, dequeued_p, asi, aei])
            f1, em = 0.0, 0.0
            for k in range(len(p_batch)):
                paragraph = p_batch[k][0].decode('utf8').split(' ')
                true_start, true_end = asi_batch[k][0], aei_batch[k][0]
                pred_start, pred_end = pt_val[k][0], pt_val[k][1]
                pred_tokens = paragraph[pred_start:(pred_end + 1)]
                true_tokens = paragraph[true_start:(true_end + 1)]
                f1 += f1_score(' '.join(pred_tokens), ' '.join(true_tokens))
                em += exact_match_score(' '.join(pred_tokens),
                                        ' '.join(true_tokens))
            print('{}th batch | f1: {} | em: {}'.format(
                j, f1 / len(p_batch), em / len(p_batch)))
            F1 += f1
            EM += em
        print('Evaluation complete, F1 score: {}, EM score: {}'.format(
            F1 / dp.num_sample, EM / dp.num_sample))
def evaluate_answer(self, sess, dataset, sample=100, log=False, mode="val"):
    """
    Evaluate the model's performance using the harmonic mean of F1 and
    Exact Match (EM) with the set of true answer labels

    This step actually takes quite some time. So we can only sample
    100 examples from either training or testing set.

    :param sess: session should always be centrally managed in train.py
    :param dataset: a representation of our data, in some implementations,
        you can pass in multiple components (arguments) of one dataset
        to this function
    :param sample: how many examples in dataset we look at
    :param log: whether we print to std out stream
    :return:
    """
    f1 = 0.
    em = 0.
    len_data = len(dataset[mode][-1])
    indexer = random.sample(xrange(len_data), sample)
    # 1. Pad data
    p, q, span, pw, qw, ans = [[component[i] for i in indexer]
                               for component in dataset[mode]]
    p, mask_p, actual_p = pad_input(p, Config.max_p_len)
    q, mask_q, actual_q = pad_input(q, Config.max_q_len)
    begin, end = zip(*span)
    # get answer
    a_s, a_e = self.answer_all(sess,
                               [p, mask_p, actual_p, q, mask_q, actual_q])
    for i in range(sample):
        # ground truth
        gt = ' '.join(ans[i])
        # prediction
        pred = ' '.join(pw[i][a_s[i]:a_e[i] + 1])
        em += exact_match_score(pred, gt)
        f1 += f1_score(pred, gt)
    if log:
        logging.info("F1: {}, EM: {}, for {} samples".format(
            f1 / sample, em / sample, sample))
        for a, b in zip(zip(begin, end), zip(a_s, a_e))[:6]:
            print("Actual: {} Predicted: {}".format(a, b))
    return f1 / sample, em / sample
def evaluate_answer(self, session, dataset, vocab, sample=100, log=False):
    """
    Evaluate the model's performance using the harmonic mean of F1 and
    Exact Match (EM) with the set of true answer labels

    This step actually takes quite some time. So we can only sample
    100 examples from either training or testing set.

    :param session: session should always be centrally managed in train.py
    :param dataset: a representation of our data, in some implementations,
        you can pass in multiple components (arguments) of one dataset
        to this function
    :param sample: how many examples in dataset we look at
    :param log: whether we print to std out stream
    :return:
    """
    f1 = 0.
    em = 0.
    totExamples = len(dataset)
    examplesToEvaluate = np.random.choice(totExamples, sample)
    for i in examplesToEvaluate:
        true_a_s = int(dataset[i]["span"][0])
        true_a_e = int(dataset[i]["span"][1])
        predicted_a_s, predicted_a_e = self.answer(session, dataset[i])
        paragraphWords = [vocab[j] for j in dataset[i]["context"]]
        ground_truth = paragraphWords[true_a_s:true_a_e + 1]
        prediction = paragraphWords[predicted_a_s:predicted_a_e + 1]
        # Turn into a sentence
        ground_truth = ' '.join(ground_truth)
        prediction = ' '.join(prediction)
        # Evaluate
        em += float(exact_match_score(prediction, ground_truth))
        f1 += f1_score(prediction, ground_truth)
    f1 /= sample
    em /= sample
    if log:
        logging.info("F1: {}, EM: {}, for {} samples".format(f1, em, sample))
    return f1, em
def evaluate_answer(self, session, data, rev_vocab, sample_num=200):
    """
    Evaluate the model's performance using the harmonic mean of F1 and
    Exact Match (EM) with the set of true answer labels

    This step actually takes quite some time. So we can only sample
    100 examples from either training or testing set.

    :param session: session should always be centrally managed in train.py
    :param dataset: a representation of our data, in some implementations,
        you can pass in multiple components (arguments) of one dataset
        to this function
    :param sample: how many examples in dataset we look at
    :return:
    """
    overall_f1 = 0.
    overall_em = 0.
    eval_batch = [
        data[i]
        for i in np.random.choice(len(data), sample_num, replace=False)
    ]
    eval_batch = list(zip(*eval_batch))  # unzip the list
    a_s_vec, a_e_vec = self.answer(session, eval_batch)
    for (a_s, a_e, context, a_true) in zip(a_s_vec, a_e_vec, eval_batch[0],
                                           eval_batch[6]):
        if a_s > a_e:
            a_s, a_e = a_e, a_s
        predicted_answer = self.formulate_answer(context, rev_vocab, a_s, a_e)
        true_answer = self.formulate_answer(context, rev_vocab, a_true[0],
                                            a_true[1])
        f1 = f1_score(predicted_answer, true_answer)
        overall_f1 += f1
        if exact_match_score(predicted_answer, true_answer):
            overall_em += 1
    average_f1 = overall_f1 / sample_num
    overall_em /= sample_num
    # logging.info("F1: {}, EM: {}, for {} samples\n".format(
    #     average_f1, overall_em, sample_num))
    return average_f1, overall_em
def evaluate_answer(self, session, dataset, log=False):
    """
    Evaluate the model's performance using the harmonic mean of F1 and
    Exact Match (EM) with the set of true answer labels

    This step actually takes quite some time. So we can only sample
    100 examples from either training or testing set.

    :param session: session should always be centrally managed in train.py
    :param dataset: a representation of our data, in some implementations,
        you can pass in multiple components (arguments) of one dataset
        to this function
    :param sample: how many examples in dataset we look at
    :param log: whether we print to std out stream
    :return:
    """
    f1 = 0.
    em = 0.
    (question, par, labels) = dataset
    num_sample = len(labels)
    # why not batch
    for index in range(0, num_sample):
        a_s, a_e = self.answer(session, question[index], par[index],
                               labels[index])
        answers = par[index][0][a_s:a_e + 1]
        p_s, p_e = labels[index]
        true_answer = par[index][0][p_s:p_e + 1]
        answers = " ".join(str(a) for a in answers)
        true_answer = " ".join(str(ta) for ta in true_answer)
        f1 += f1_score(answers, true_answer)
        em += exact_match_score(answers, true_answer)
        # logging.info("answers %s, true_answer %s" % (answers, true_answer))
    f1 /= num_sample
    em /= num_sample
    if log:
        logging.info("F1: {:.2%}, EM: {:.2%}, for {} samples".format(
            f1, em, num_sample))
    return f1, em
def evaluate_answer(self, session, dataset, samples=100, log=False):
    """
    Evaluate the model's performance using the harmonic mean of F1 and
    Exact Match (EM) with the set of true answer labels

    This step actually takes quite some time. So we can only sample
    100 examples from either training or testing set.

    :param session: session should always be centrally managed in train.py
    :param dataset: a representation of our data, in some implementations,
        you can pass in multiple components (arguments) of one dataset
        to this function
    :param log: whether we print to std out stream
    :return:
    """
    samples = min(samples, len(dataset[0]))
    c_ids, c_len, q_ids, q_len, span = dataset
    f1 = 0.
    em = 0.
    for index in range(samples):
        a_s, a_e = self.answer(
            session, (c_ids[index], c_len[index], q_ids[index], q_len[index]))
        answers = c_ids[index][a_s:a_e + 1]
        p_s, p_e = span[index]
        true_answer = c_ids[index][p_s:p_e + 1]
        answers = " ".join(str(a) for a in answers)
        true_answer = " ".join(str(ta) for ta in true_answer)
        f1 += f1_score(answers, true_answer)
        em += exact_match_score(answers, true_answer)
        # logging.info("answers %s, true_answer %s" % (answers, true_answer))
    f1 /= samples
    em /= samples
    if log:
        logging.info("F1: {:.2%}, EM: {:.2%}, for {} samples".format(
            f1, em, samples))
    return f1, em
def evaluate_answer(self, session, dataset, sample=100, log=False):
    """
    Our dataset format: a list of (context, question, begin, end)

    Evaluate the model's performance using the harmonic mean of F1 and
    Exact Match (EM) with the set of true answer labels

    This step actually takes quite some time. So we can only sample
    100 examples from either training or testing set.

    :param session: session should always be centrally managed in train.py
    :param dataset: a representation of our data, in some implementations,
        you can pass in multiple components (arguments) of one dataset
        to this function
    :param sample: how many examples in dataset we look at
    :param log: whether we print to std out stream
    :return:
    """
    if len(dataset) > sample:
        dataset = random.sample(dataset, sample)
    f1, em = 0., 0.
    for context, question, begin, end in dataset:
        a_s, a_e = self.answer(session, context, question)
        a_s = min(a_s, len(context) - 1)
        a_e = min(a_e, len(context) - 1)
        if a_s > a_e:
            a_s, a_e = a_e, a_s
        prediction = context[a_s:(a_e + 1)]
        prediction = ' '.join([str(x) for x in prediction])
        ground_truth = context[begin:(end + 1)]
        ground_truth = ' '.join([str(x) for x in ground_truth])
        f1 += f1_score(prediction, ground_truth)
        em += exact_match_score(prediction, ground_truth)
    f1 = f1 * 100 / len(dataset)
    em = em * 100 / len(dataset)
    if log:
        logging.info("F1: {}, EM: {}, for {} samples".format(f1, em, sample))
    return f1, em
def evaluate_answer(self, session, dataset, sample=100, log=False):
    """
    Evaluate the model's performance using the harmonic mean of F1 and
    Exact Match (EM) with the set of true answer labels

    This step actually takes quite some time. So we can only sample
    100 examples from either training or testing set.

    :param session: session should always be centrally managed in train.py
    :param dataset: a representation of our data, in some implementations,
        you can pass in multiple components (arguments) of one dataset
        to this function
    :param sample: how many examples in dataset we look at
    :param log: whether we print to std out stream
    :return:
    """
    q_batch, q_lens, p_batch, p_lens, s_label_batch, e_label_batch = \
        make_eval_batch(dataset, sample)
    test_x = (q_batch, q_lens, p_batch, p_lens)
    pred_s, pred_e = self.answer(session, test_x)
    f1 = [
        f1_score(p_batch[i][pred_s[i]:pred_e[i]],
                 p_batch[i][s_label_batch[i]:e_label_batch[i]])
        for i in range(sample)
    ]
    em = [
        exact_match_score(p_batch[i][pred_s[i]:pred_e[i]],
                          p_batch[i][s_label_batch[i]:e_label_batch[i]])
        for i in range(sample)
    ]
    if log:
        logging.info("F1: {}, EM: {}, for {} samples".format(f1, em, sample))
    return f1, em
def evaluate_answer(self, session, dataset_train, dataset_val, sample=100,
                    log=False):
    """
    Evaluate the model's performance using the harmonic mean of F1 and
    Exact Match (EM) with the set of true answer labels

    This step actually takes quite some time. So we can only sample
    100 examples from either training or testing set.

    :param session: session should always be centrally managed in train.py
    :param dataset: a representation of our data, in some implementations,
        you can pass in multiple components (arguments) of one dataset
        to this function
    :param sample: how many examples in dataset we look at
    :param log: whether we print to std out stream
    :return:
    """
    f1, em = 0., 0.
    # Sample each for half of total samples
    feed_data, ground_truth = get_sampled_data(dataset_train, dataset_val,
                                               self.context_length,
                                               self.question_length,
                                               sample=sample)
    for i, d in enumerate(feed_data):
        a_s, a_e = self.answer(session, (d[0], d[1]))
        answer = d[0][0].flatten()[int(a_s):int(a_e) + 1].tolist()
        truth = ' '.join([str(s) for s in ground_truth[i]])
        ans = ' '.join([str(s) for s in answer])
        f1 += f1_score(ans, truth) / sample
        if exact_match_score(ans, truth):
            em += 1. / sample
    if log:
        logging.info("F1: {}, EM: {}%, for {} samples".format(
            f1, em * 100, sample))
    return f1, em
def search_eval(data, max_span=15, op="+"):
    em = 0.
    f1 = 0.
    for start, end, paragraph, answer in data:
        s, e, prob = 0, 0, 0
        for i in range(len(start)):
            for j in range(min(max_span, len(end) - i)):
                if op == "+":
                    if start[i] + end[i + j] > prob:
                        prob = start[i] + end[i + j]
                        s, e = i, i + j
                if op == "*":
                    if start[i] * end[i + j] > prob:
                        prob = start[i] * end[i + j]
                        s, e = i, i + j
        text_p = " ".join(paragraph[s:e + 1])
        text_g = " ".join(paragraph[answer[0]:answer[1] + 1])
        f1 += f1_score(text_p, text_g)
        em += exact_match_score(text_p, text_g)
    print("search EM: {:.5f}, F1: {:.5f} (max_span={}, op={})".format(
        em / len(data), f1 / len(data), max_span, op))
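# Usage sketch for argmax_eval and search_eval above. The toy data layout is
# hypothetical but matches what both functions index into: an iterable of
# (start_probs, end_probs, paragraph_tokens, answer_span) tuples. argmax_eval
# takes the independent argmax of each distribution, so the end can land
# before the start; search_eval constrains the span to start <= end and at
# most max_span tokens, which is often worth a small F1/EM gain.
if __name__ == "__main__":
    import numpy as np

    # one toy example: distributions over a 6-token paragraph
    start_probs = np.array([0.1, 0.6, 0.1, 0.1, 0.05, 0.05])
    end_probs = np.array([0.05, 0.05, 0.6, 0.1, 0.1, 0.1])
    paragraph = ["the", "quick", "brown", "fox", "jumps", "over"]
    answer_span = (1, 2)  # gold answer: "quick brown"
    dev_data = [(start_probs, end_probs, paragraph, answer_span)]

    argmax_eval(dev_data)
    search_eval(dev_data, max_span=15, op="*")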
def check_f1_em(self, session, context_path, qn_path, ans_path, dataset,
                num_samples=100, print_to_screen=False):
    """
    Sample from the provided (train/dev) set. For each sample, calculate F1
    and EM score. Return average F1 and EM score for all samples.
    Optionally pretty-print examples.

    Note: This function is not quite the same as the F1/EM numbers you get
    from "official_eval" mode. This function uses the pre-processed version
    of the e.g. dev set for speed, whereas "official_eval" mode uses the
    original JSON. Therefore:
      1. official_eval takes your max F1/EM score w.r.t. the three reference
         answers, whereas this function compares to just the first answer
         (which is what's saved in the preprocessed data)
      2. Our preprocessed version of the dev set is missing some examples
         due to tokenization issues (see squad_preprocess.py).
         "official_eval" includes all examples.

    Inputs:
      session: TensorFlow session
      qn_path, context_path, ans_path: paths to
        {dev/train}.{question/context/answer} data files.
      dataset: string. Either "train" or "dev". Just for logging purposes.
      num_samples: int. How many samples to use. If num_samples=0 then do
        whole dataset.
      print_to_screen: if True, pretty-prints each example to screen

    Returns:
      F1 and EM: Scalars. The average across the sampled examples.
    """
    logging.info("Calculating F1/EM for %s examples in %s set..." %
                 (str(num_samples) if num_samples != 0 else "all", dataset))
    f1_total = 0.
    em_total = 0.
    example_num = 0
    tic = time.time()
    # Note here we select discard_long=False because we want to sample from
    # the entire dataset. That means we're truncating, rather than
    # discarding, examples with too-long context or questions.
    for batch in get_batch_generator(self.word2id, context_path, qn_path,
                                     ans_path, self.FLAGS.batch_size,
                                     context_len=self.FLAGS.context_len,
                                     question_len=self.FLAGS.question_len,
                                     discard_long=False):
        pred_start_pos, pred_end_pos = self.get_start_end_pos(session, batch)
        # Convert the start and end positions to lists length batch_size
        pred_start_pos = pred_start_pos.tolist()  # list length batch_size
        pred_end_pos = pred_end_pos.tolist()  # list length batch_size
        for ex_idx, (pred_ans_start, pred_ans_end, true_ans_tokens) in enumerate(
                zip(pred_start_pos, pred_end_pos, batch.ans_tokens)):
            example_num += 1
            # Get the predicted answer
            # Important: batch.context_tokens contains the original words (no UNKs)
            # You need to use the original no-UNK version when measuring F1/EM
            pred_ans_tokens = batch.context_tokens[ex_idx][
                pred_ans_start:pred_ans_end + 1]
            pred_answer = " ".join(pred_ans_tokens)
            # Get true answer (no UNKs)
            true_answer = " ".join(true_ans_tokens)
            # Calc F1/EM
            f1 = f1_score(pred_answer, true_answer)
            em = exact_match_score(pred_answer, true_answer)
            f1_total += f1
            em_total += em
            # Optionally pretty-print
            if print_to_screen:
                print_example(self.word2id, batch.context_tokens[ex_idx],
                              batch.qn_tokens[ex_idx], batch.ans_span[ex_idx, 0],
                              batch.ans_span[ex_idx, 1], pred_ans_start,
                              pred_ans_end, true_answer, pred_answer, f1, em)
            if num_samples != 0 and example_num >= num_samples:
                break
        if num_samples != 0 and example_num >= num_samples:
            break
    f1_total /= example_num
    em_total /= example_num
    toc = time.time()
    logging.info("Calculating F1/EM for %i examples in %s set took %.2f seconds"
                 % (example_num, dataset, toc - tic))
    return f1_total, em_total
def evaluate_answer(self, session, dataset, answers, rev_vocab,
                    set_name='val', training=False, log=False,
                    sample=(100, 100), sendin=None, ensemble=False):
    """
    Evaluate the model's performance using the harmonic mean of F1 and
    Exact Match (EM) with the set of true answer labels

    This step actually takes quite some time. So we can only sample
    100 examples from either training or testing set.

    :param session: session should always be centrally managed in train.py
    :param dataset: a representation of our data, in some implementations,
        you can pass in multiple components (arguments) of one dataset
        to this function
    :param sample: how many examples in dataset we look at
    :param log: whether we print to std out stream
    :return:
    """
    if not isinstance(rev_vocab, np.ndarray):
        rev_vocab = np.array(rev_vocab)
    if not isinstance(sample, tuple):
        sample = (sample, sample)
    input_batch_size = 100
    if training:
        train_context = dataset['train_context'][:sample[0]]
        train_question = dataset['train_question'][:sample[0]]
        train_answer = answers['raw_train_answer'][:sample[0]]
        train_len = len(train_context)
        if sendin and len(sendin) > 2:
            train_a_s, train_a_e = sendin[0:2]
        else:
            train_a_e = np.array([], dtype=np.int32)
            train_a_s = np.array([], dtype=np.int32)
            for i in tqdm(range(train_len // input_batch_size),
                          desc='training set'):
                train_as, train_ae = self.answer(
                    session,
                    train_context[i * input_batch_size:(i + 1) * input_batch_size],
                    train_question[i * input_batch_size:(i + 1) * input_batch_size])
                train_a_s = np.concatenate((train_a_s, train_as), axis=0)
                train_a_e = np.concatenate((train_a_e, train_ae), axis=0)
        tf1 = 0.
        tem = 0.
        for i, con in enumerate(train_context):
            sys.stdout.write('>>> %d / %d \r' % (i, train_len))
            sys.stdout.flush()
            prediction_ids = con[0][train_a_s[i]:train_a_e[i] + 1]
            prediction = rev_vocab[prediction_ids]
            prediction = ' '.join(prediction)
            tf1 += f1_score(prediction, train_answer[i])
            tem += exact_match_score(prediction, train_answer[i])
        if log:
            logging.info("Training set ==> F1: {}, EM: {}, for {} samples".format(
                tf1 / train_len, tem / train_len, train_len))
    # it was set to 1.0
    f1 = 0.0
    em = 0.0
    val_context = dataset[set_name + '_context'][:sample[1]]
    val_question = dataset[set_name + '_question'][:sample[1]]
    # e.g. ['Corpus Juris Canonici', 'the Northside', 'Naples', ...]
    val_answer = answers['raw_val_answer'][:sample[1]]
    val_len = len(val_context)
    # logging.info('calculating the validation set predictions.')
    if sendin and len(sendin) > 2:
        val_a_s, val_a_e = sendin[-2:]
    elif sendin:
        val_a_s, val_a_e = sendin
    else:
        val_a_s = np.array([], dtype=np.int32)
        val_a_e = np.array([], dtype=np.int32)
        for i in tqdm(range(val_len // input_batch_size), desc='validation '):
            a_s, a_e = self.answer(
                session,
                val_context[i * input_batch_size:(i + 1) * input_batch_size],
                val_question[i * input_batch_size:(i + 1) * input_batch_size])
            val_a_s = np.concatenate((val_a_s, a_s), axis=0)
            val_a_e = np.concatenate((val_a_e, a_e), axis=0)
    # logging.info('getting scores of dev set.')
    for i, con in enumerate(val_context):
        sys.stdout.write('>>> %d / %d \r' % (i, val_len))
        sys.stdout.flush()
        prediction_ids = con[0][val_a_s[i]:val_a_e[i] + 1]
        prediction = rev_vocab[prediction_ids]
        prediction = ' '.join(prediction)
        f1 += f1_score(prediction, val_answer[i])
        em += exact_match_score(prediction, val_answer[i])
    if log:
        logging.info("Validation ==> F1: {}, EM: {}, for {} samples".format(
            f1 / val_len, em / val_len, val_len))
    if ensemble and training:
        return train_a_s, train_a_e, val_a_s, val_a_e
    elif ensemble:
        return val_a_s, val_a_e
    else:
        return tf1 / train_len, tem / train_len, f1 / val_len, em / val_len