def get_decomposed(orig_question, prediction, prediction2, is_bridge, with_key):
    """Decompose a multi-hop question into two sub-questions.

    Args:
        orig_question: the original multi-hop question string.
        prediction: the predicted sub-span of the question.
        prediction2: variant of the span with an article replaced by a keyword.
        is_bridge: True for bridge-style decomposition, False for intersection.
        with_key: if True, use ``prediction2`` as question1 (bridge case).

    Returns:
        (normalized original question, question1, question2); in the bridge
        case question2 contains the '[ANSWER]' placeholder.
    """
    # Collapse runs of spaces into single spaces.
    # BUG FIX: the loop previously replaced ' ' with ' ' (a no-op that never
    # terminates whenever a space is present); the intent is '  ' -> ' '.
    while '  ' in orig_question:
        orig_question = orig_question.replace('  ', ' ')

    if is_bridge:
        question1 = prediction2 if with_key else prediction
        question2 = orig_question.replace(prediction, '[ANSWER]')
        assert '[ANSWER]' in question2
        # Smooth out awkward junctions around the placeholder.
        # BUG FIX: '[ANSWER] whose' must be checked before '[ANSWER] who',
        # otherwise the shorter token matches first ("whose" -> " [ANSWER] se")
        # and the possessive branch is unreachable. The possessive rewrite is
        # also assigned back to question2 (it previously went to an unused
        # local `question`).
        for token in [', [ANSWER]', '[ANSWER] ,', '[ANSWER] whose',
                      '[ANSWER] who', '[ANSWER] when', '[ANSWER] where',
                      '[ANSWER] which', '[ANSWER] that']:
            if token in question2:
                if token == '[ANSWER] whose':
                    question2 = question2.replace(token, " [ANSWER] 's ")
                else:
                    question2 = question2.replace(token, ' [ANSWER] ')
    else:
        # Intersection case: locate `prediction` inside the question tokens.
        orig_question_tokens = orig_question.split(' ')
        prediction_tokens = prediction.split(' ')
        start, end = None, None
        # 1) exact token-level match
        for i in range(len(orig_question_tokens) - len(prediction_tokens) + 1):
            if orig_question_tokens[i:i + len(prediction_tokens)] == prediction_tokens:
                start, end = i, i + len(prediction_tokens)
                break
        # 2) match after answer normalization
        if start is None and end is None:
            for i in range(len(orig_question_tokens) - len(prediction_tokens) + 1):
                text = ' '.join(orig_question_tokens[i:i + len(prediction_tokens)])
                if normalize_answer(text) == normalize_answer(prediction):
                    start, end = i, i + len(prediction_tokens)
                    break
        # 3) normalized prefix match; extend the span to the question's end
        if start is None and end is None:
            for i in range(len(orig_question_tokens) - len(prediction_tokens) + 1):
                text = ' '.join(orig_question_tokens[i:i + len(prediction_tokens)])
                if normalize_answer(text).startswith(normalize_answer(prediction)):
                    start, end = i, len(orig_question_tokens)
                    print("==== to long question ====")
                    print(' '.join(orig_question_tokens))
                    print(' '.join(orig_question_tokens[start:end]))
                    break
        assert start is not None and end is not None
        question1, question2 = intersection_convert_to_queries(
            orig_question_tokens, start, end - 1)
        question1, question2 = ' '.join(question1), ' '.join(question2)
    return orig_question, question1, question2
def evaluate_question_detector(questions: List[HotpotQuestion], word_tokenize,
                               detector, reference_detector=None,
                               compute_f1s=False):
    """Just for debugging: report how often `detector` locates the gold answer
    inside the supporting facts of each question.

    Args:
        questions: HotpotQA questions to evaluate (yes/no comparison questions
            are skipped since they have no extractable span).
        word_tokenize: callable turning an answer string into tokens.
        detector: object with set_question(aliases) / any_found(sentences).
        reference_detector: optional second detector; when the primary finds
            nothing, its hits are printed to highlight disagreements.
        compute_f1s: additionally compute token-level F1 for every found span.
    """
    answer_per_q = []
    answer_f1s = []
    for q in tqdm(questions):
        if q.answer in {'yes', 'no'} and q.q_type == 'comparison':
            continue
        tokenized_aliases = [word_tokenize(q.answer)]
        detector.set_question(tokenized_aliases)
        output = []
        for i, par in enumerate(q.supporting_facts):
            for s, e in detector.any_found(par.sentences):
                output.append((i, s, e))
        # If the detector found nothing, check whether the reference detector
        # disagrees (useful to spot regressions).
        # BUG FIX: the original nested a redundant second
        # `if reference_detector is not None:` check here.
        if len(output) == 0 and reference_detector is not None:
            reference_detector.set_question(tokenized_aliases)
            detected = []
            for j, par in enumerate(q.supporting_facts):
                for s, e in reference_detector.any_found(par.sentences):
                    detected.append((j, s, e))
            if len(detected) > 0:
                print("Found a difference")
                # NOTE(review): q.answer is treated as a plain string
                # everywhere else in this function, so `normalized_aliases`
                # may not exist on it — confirm against HotpotQuestion.
                print(q.answer.normalized_aliases)
                print(tokenized_aliases)
                for p, s, e in detected:
                    token = flatten_iterable(q.supporting_facts[p].sentences)[s:e]
                    print(token)
        answer_per_q.append(output)
        if compute_f1s:
            f1s = []
            for p, s, e in output:
                token = flatten_iterable(q.supporting_facts[p].sentences)[s:e]
                answer = normalize_answer(" ".join(token))
                f1, _, _ = f1_score(answer, normalize_answer(q.answer))
                f1s.append(f1)
            answer_f1s.append(f1s)

    # Guard against division by zero when every question was skipped.
    if len(answer_per_q) == 0:
        print("No questions evaluated")
        return
    n_answers = sum(len(x) for x in answer_per_q)
    print("Found %d answers (av %.4f)" % (n_answers, n_answers / len(answer_per_q)))
    print("%.4f docs have answers" % np.mean([len(x) > 0 for x in answer_per_q]))
    if len(answer_f1s) > 0:
        print("Average f1 is %.4f" % np.mean(flatten_iterable(answer_f1s)))
def _normalize_answer(text):
    """Strip (possibly truncated) ``<title>`` markup and unbalanced
    parenthesis fragments from ``text``, then apply the standard answer
    normalization."""
    # Remove complete title tags anywhere in the string.
    text = text.replace('<title>', '').replace('</title>', '')
    # Suffixes of '/title>' that may survive at the start of a truncated span.
    leading_fragments = ['/title>'[i:] for i in range(len('/title>'))]
    # Prefixes of '</title>' / '<title>' that may survive at the end.
    trailing_fragments = (['</title>'[:-i] for i in range(1, len('</title>'))]
                          + ['<title>'[:-i] for i in range(1, len('<title>'))])
    for fragment in leading_fragments:
        if text.startswith(fragment):
            text = text[len(fragment):]
    for fragment in trailing_fragments:
        if text.endswith(fragment):
            text = text[:-len(fragment)]
    # If only one side of a parenthesis pair is present, keep the longest
    # piece (ties go to the first, matching the original argmax behavior).
    for bracket, other in (('(', ')'), (')', '(')):
        if bracket in text and other not in text:
            pieces = [piece.strip() for piece in text.split(bracket)]
            text = max(pieces, key=len)
    return normalize_answer(text)
def run(json_file_name, answer_file_name, eval_file_name):
    """Interactive debug loop: for every example in ``json_file_name``, print
    the bridge decomposition and the top one-hop answers for each hop, pausing
    for Enter between examples.

    ``answer_file_name`` and ``eval_file_name`` are accepted only to keep a
    uniform signature; they are not used here.
    """
    import json
    with open(json_file_name) as f:
        data = json.load(f)
    from qa.my_main import DecompRC

    model = DecompRC(batch_size=50)
    for example in data:
        example_id = example['_id']  # raises KeyError on malformed input
        answer = normalize_answer(example['answer'])
        question = example['question']
        context = example['context']
        (q1_bridge, q2_bridge), (q1_inter, q2_inter) = model.get_output(
            "span-predictor", question, context)
        print("Q : {}".format(question))
        print("A : {}".format(answer))
        print("Q1: {}".format(q1_bridge))
        hop1_candidates = one_hop_answers(context, q1_bridge, 5)
        print("A1: {}".format([candidate[0] for candidate in hop1_candidates]))
        # Substitute the best first-hop answer into the second sub-question.
        follow_up = q2_bridge.replace("[ANSWER]", hop1_candidates[0][0])
        print("Q2: {}".format(follow_up))
        hop2_candidates = one_hop_answers(context, follow_up, 5)
        print("A2: {}".format([candidate[0] for candidate in hop2_candidates]))
        print("========================================")
        input()
def run(json_file_name, answer_file_name, eval_file_name):
    """Evaluate bridge / intersection / combined decomposition answers over a
    HotpotQA json file.

    Appends one CSV row per example to ``answer_file_name`` and four summary
    rows (F1, precision, recall, EM — each as [bridge, intersection,
    combined]) to ``eval_file_name``.
    """
    import json
    with open(json_file_name) as f:
        data = json.load(f)
    from qa.my_main import DecompRC
    model = DecompRC(batch_size=50)

    # Accumulators indexed as [bridge, intersection, combined].
    fscores = [0, 0, 0]
    ems = [0, 0, 0]
    precision = [0, 0, 0]
    recall = [0, 0, 0]

    # Resume support: skip every example up to and including this id.
    SEETHISID = "5a81b2505542995ce29dcc32"
    FLAG = False
    for d in data:
        example_id = d['_id']
        if not FLAG:
            if SEETHISID == example_id:
                FLAG = True
            continue
        a = normalize_answer(d['answer'])
        q = d['question']
        p = d['context']
        if len(p) == 0:
            continue
        (q1_b, q2_b), (q1_i, q2_i) = model.get_output("span-predictor", q, p)
        print("Q : {}".format(q))
        print("A : {}".format(a))

        # --- bridge decomposition: answer hop 1, substitute, answer hop 2 ---
        first_answer, _ = best_one_hop_answer(p, q)
        next_question = q2_b.replace("[ANSWER]", first_answer)
        bridge_answer, bridge_score = best_one_hop_answer(p, next_question)
        bridge_answer = normalize_answer(bridge_answer)
        print("A-B: {}".format(bridge_answer))

        # --- intersection decomposition: grow k until the answer sets of the
        # two sub-questions overlap, then keep the highest-scoring common one.
        common_answers = []
        k = 10
        while len(common_answers) == 0:
            first_answers = best_k_answers(p, q1_i, k)
            second_answers = best_k_answers(p, q2_i, k)
            second_answers_set = set(tup[0] for tup in second_answers)
            common_answers = [tup for tup in first_answers
                              if tup[0] in second_answers_set]
            k += 10
        # max() keeps the first maximum, matching the original strict-> scan.
        intersec_answer, intersec_score = max(common_answers,
                                              key=lambda tup: tup[1])
        intersec_answer = normalize_answer(intersec_answer)
        print("A-I: {}".format(intersec_answer))

        # --- combined: pick the higher-scoring of the two strategies ---
        ultimate_answer = bridge_answer
        if intersec_score > bridge_score:
            ultimate_answer = intersec_answer
        print("A-C: {}".format(ultimate_answer))
        print("========================================")

        # BUG FIX: exact-match was previously accumulated with bridge_answer
        # for all three strategies.
        for idx, answer in enumerate([bridge_answer, intersec_answer,
                                      ultimate_answer]):
            f1, prcsn, rcll = f1_score(answer, a)
            fscores[idx] += f1
            precision[idx] += prcsn
            recall[idx] += rcll
            ems[idx] += answer == a

        with open(answer_file_name, mode='a') as file:
            row = [
                example_id, q, a, bridge_answer, bridge_score,
                intersec_answer, intersec_score, ultimate_answer
            ]
            writer = csv.writer(file)
            writer.writerow(row)

    # NOTE(review): N counts all examples, including the ones skipped above,
    # so these averages are lower bounds.
    N = len(data)
    fscores = [i / N for i in fscores]
    ems = [i / N for i in ems]
    # BUG FIX: precision/recall were previously recomputed from the already
    # averaged fscores/ems lists instead of their own accumulators.
    precision = [i / N for i in precision]
    recall = [i / N for i in recall]
    with open(eval_file_name, mode='a') as file:
        writer = csv.writer(file)
        writer.writerow(fscores)
        writer.writerow(precision)
        writer.writerow(recall)
        writer.writerow(ems)
def get_span_prediction(examples, features, result, with_keyword):
    """Pick the best predicted span for a single example and decompose the
    question around it into two sub-questions.

    Args:
        examples: list with exactly one example (provides ``doc_tokens`` —
            the question tokens — and ``all_answers``).
        features: tokenized features; the one with the smallest unique_id is
            used for scoring, ``features[0]`` for text reconstruction.
        result: list with one model output carrying ``switch``,
            ``start_logits``, ``end_logits`` (and ``keyword_logits`` when
            ``with_keyword``).
        with_keyword: whether keyword logits participate in scoring and the
            keyword-substituted span variant becomes question1.

    Returns:
        (question1, question2) after whitespace/'?' post-processing, from the
        highest-scoring surviving candidate; implicitly None if no candidate
        survives filtering.
    """
    prelim_predictions = []
    assert len(examples) == 1
    example = examples[0]
    feature = sorted(features, key=lambda f: f.unique_id)[0]

    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        "PrelimPrediction",
        ["start_index", "end_index", "keyword_index", "logit"])

    if len(result) != 1:
        # Debug hook: drop into an interactive shell on unexpected input.
        from IPython import embed
        embed()
    result = result[0]

    switch = np.argmax(result.switch)
    if switch == 1:
        # "No span" prediction.
        prelim_predictions.append(
            _PrelimPrediction(start_index=-1, end_index=-1,
                              keyword_index=-1, logit=result.switch[1]))
    elif switch == 0:
        # Enumerate candidate (start, end[, keyword]) triples with scores.
        scores = []
        start_logits = result.start_logits[:len(feature.tokens)]
        end_logits = result.end_logits[:len(feature.tokens)]
        if with_keyword:
            keyword_logits = result.keyword_logits[:len(feature.tokens)]
            for (i, s) in enumerate(start_logits):
                for (j, e) in enumerate(end_logits[i:]):
                    for (k, key) in enumerate(keyword_logits[i:i + j + 1]):
                        scores.append(((i, i + j, i + k), s + e + key))
        else:
            for (i, s) in enumerate(start_logits):
                for (j, e) in enumerate(end_logits[i:]):
                    scores.append(((i, i + j, i), s + e))
        scores = sorted(scores, key=lambda x: x[1], reverse=True)
        for (start_index, end_index, keyword_index), score in scores:
            # Filter candidates that fall outside the original-token mapping,
            # split a word-piece, are inverted, or are too short.
            if start_index >= len(feature.tokens):
                continue
            if end_index >= len(feature.tokens):
                continue
            if not (start_index <= keyword_index <= end_index):
                continue
            if start_index not in feature.token_to_orig_map or \
                    end_index not in feature.token_to_orig_map:
                continue
            if start_index - 1 in feature.token_to_orig_map and \
                    feature.token_to_orig_map[start_index - 1] == \
                    feature.token_to_orig_map[start_index]:
                continue
            if end_index + 1 in feature.token_to_orig_map and \
                    feature.token_to_orig_map[end_index + 1] == \
                    feature.token_to_orig_map[end_index]:
                continue
            if end_index < start_index:
                continue
            length = end_index - start_index
            if length <= 2:
                continue
            prelim_predictions.append(
                _PrelimPrediction(start_index=start_index,
                                  end_index=end_index,
                                  keyword_index=keyword_index,
                                  logit=score))
    else:
        raise NotImplementedError()

    prelim_predictions = sorted(prelim_predictions,
                                key=lambda x: x.logit,
                                reverse=True)

    def get_text(start_index, end_index, keyword_index):
        # Convert token indices to (span text, keyword-variant span text).
        if start_index == end_index == -1:
            final_text = example.all_answers[-1]
            # BUG FIX: final_text2 was never assigned on this path, so the
            # return below raised NameError whenever switch == 1.
            final_text2 = final_text
        else:
            feature = features[0]
            tok_tokens = feature.tokens[start_index:(end_index + 1)]
            orig_doc_start = feature.token_to_orig_map[start_index]
            orig_doc_end = feature.token_to_orig_map[end_index]
            orig_doc_keyword = feature.token_to_orig_map[keyword_index]
            orig_tokens = feature.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
            orig_tokens2 = orig_tokens.copy()
            # Replace the nearest preceding article (looking back at most 5
            # tokens from the keyword position) with 'which'.
            for i in range(orig_doc_keyword, orig_doc_keyword - 5, -1):
                if i - orig_doc_start < 0:
                    break
                if orig_tokens[i - orig_doc_start] in ['the', 'a', 'an']:
                    orig_tokens2[i - orig_doc_start] = 'which'
                    assert orig_tokens[i - orig_doc_start] != 'which'
                    break
            tok_text = " ".join(tok_tokens)
            # De-tokenize WordPieces that have been split off.
            tok_text = tok_text.replace(" ##", "")
            tok_text = tok_text.replace("##", "")
            # Clean whitespace
            tok_text = tok_text.strip()
            tok_text = " ".join(tok_text.split())
            final_text = get_final_text(tok_text, " ".join(orig_tokens))
            final_text2 = get_final_text(tok_text, " ".join(orig_tokens2))
        return final_text, final_text2

    for pred in prelim_predictions:
        prediction, prediction2 = get_text(pred.start_index, pred.end_index,
                                           pred.keyword_index)
        orig_question = ' '.join(example.doc_tokens)
        if with_keyword:
            question1 = prediction2 if with_keyword else prediction
            question2 = orig_question.replace(prediction, '[ANSWER]')
            assert '[ANSWER]' in question2
            # BUG FIX: '[ANSWER] whose' is checked before '[ANSWER] who'
            # (otherwise the shorter token matched first, leaving a stray
            # 'se'), and the possessive rewrite is assigned back to question2
            # instead of an unused local.
            for token in [', [ANSWER]', '[ANSWER] ,', '[ANSWER] whose',
                          '[ANSWER] who', '[ANSWER] when', '[ANSWER] where',
                          '[ANSWER] which', '[ANSWER] that']:
                if token in question2:
                    if token == '[ANSWER] whose':
                        question2 = question2.replace(token, " [ANSWER] 's ")
                    else:
                        question2 = question2.replace(token, ' [ANSWER] ')
        else:
            # Intersection case: locate the prediction inside the question.
            orig_question_tokens = orig_question.split(' ')
            prediction_tokens = prediction.split(' ')
            start, end = None, None
            # 1) exact token-level match
            for i in range(len(orig_question_tokens) - len(prediction_tokens) + 1):
                if orig_question_tokens[i:i + len(prediction_tokens)] == prediction_tokens:
                    start, end = i, i + len(prediction_tokens)
                    break
            # 2) match after answer normalization
            if start is None and end is None:
                for i in range(len(orig_question_tokens) - len(prediction_tokens) + 1):
                    text = ' '.join(orig_question_tokens[i:i + len(prediction_tokens)])
                    if normalize_answer(text) == normalize_answer(prediction):
                        start, end = i, i + len(prediction_tokens)
                        break
            # 3) normalized prefix match; span runs to the question's end
            if start is None and end is None:
                for i in range(len(orig_question_tokens) - len(prediction_tokens) + 1):
                    text = ' '.join(orig_question_tokens[i:i + len(prediction_tokens)])
                    if normalize_answer(text).startswith(normalize_answer(prediction)):
                        start, end = i, len(orig_question_tokens)
                        print("==== to long question ====")
                        print(' '.join(orig_question_tokens))
                        print(' '.join(orig_question_tokens[start:end]))
                        break
            if start is None or end is None:
                # BUG FIX: the original swallowed this failure with a
                # try/assert/except that only printed, then crashed later in
                # an except handler calling `embed` (not in scope here).
                # Fail fast with context instead.
                print(orig_question)
                print(prediction)
                raise ValueError("prediction span not found in question")
            question1, question2 = intersection_convert_to_queries(
                orig_question_tokens, start, end - 1)
            question1, question2 = ' '.join(question1), ' '.join(question2)

        def postprocess(question):
            # Collapse duplicated spaces and ensure a single trailing '?'.
            question = question.strip()
            # BUG FIX: restored '  ' -> ' ' (the collapsed source read
            # replace(' ', ' '), which never terminates).
            while '  ' in question:
                question = question.replace('  ', ' ')
            if not question.endswith('?'):
                question += '?'
            while question.replace(' ', '').endswith('??'):
                question = question[:-1]
            return question

        # Only the highest-scoring surviving prediction is used.
        return postprocess(question1), postprocess(question2)