def segment(ex, vocab, threshold=0.25):
    """Split the question around the chunk that best matches the answer span.

    Scans every contiguous chunk of ``ex['question']``, scores it against the
    detokenized answer span with token F1, and — if the best score clears
    ``threshold`` — returns the question text before/after that chunk plus
    vocab-encoded id tensors for each side. Returns ``None`` when no chunk
    matches well enough.
    """
    start, end = ex['span']
    span_text = detokenize(ex['inp'][start:end + 1])
    question = ex['question']

    best = (None, None, -1)  # (i, j, score)
    for i in range(len(question)):
        for j in range(i, len(question)):
            candidate = detokenize(question[i:j + 1])
            score = compute_f1(span_text, candidate)
            if score > best[2]:
                best = (i, j, score)

    best_i, best_j, best_score = best
    if best_score <= threshold:
        return None

    ret = {
        'before': get_orig(ex['question'][:best_i]),
        'after': get_orig(ex['question'][best_j + 1:]),
    }
    # add id-tensor variants for each side, terminated with 'eos'
    ret.update({
        key + '_vids': torch.tensor(
            vocab.word2index(words + ['eos']), dtype=torch.long)
        for key, words in ret.items()
    })
    return ret
def compute_metrics(self, preds, data):
    """Extend the base metrics with a span-level F1 score.

    Joins each example's predicted and gold spans with newlines, scores them
    with token F1, and stores the mean under ``metrics['span_f1']``.
    """
    # NOTE: this calls the module-level compute_metrics helper, not this method
    metrics = compute_metrics(preds, data)
    scores = []
    for pred, ex in zip(preds, data):
        inp = ex['feat']['inp']
        predicted = [detokenize(inp[s:e + 1]) for s, e in pred['spans']]
        gold = [detokenize(inp[s:e + 1]) for s, e in ex['feat']['spans']]
        scores.append(compute_f1('\n'.join(gold), '\n'.join(predicted)))
    metrics['span_f1'] = sum(scores) / len(scores)
    return metrics
def compute_entailment(self, spans, ex):
    """Score each candidate span against the dialogue history and scenario.

    For every span, computes the best token-F1 overlap with any history
    question and with the scenario text, clamped at 0. Returns a
    ``(num_spans, 2)`` float tensor: column 0 = history, column 1 = scenario.
    """
    chunks = [detokenize(ex['feat']['inp'][s:e + 1]) for s, e in spans]
    scenario_text = detokenize(ex['ann']['scenario'])  # invariant, hoisted
    history, scenario = [], []
    for chunk in chunks:
        best_hist = max(
            (compute_f1(chunk, detokenize(q)) for q in ex['ann']['hquestion']),
            default=0)
        history.append(max(0, best_hist))
        scenario.append(max(0, compute_f1(chunk, scenario_text)))
    entail = torch.tensor([history, scenario],
                          dtype=torch.float, device=self.device).t()
    return entail
def extract_preds(self, out, batch, top_k=20):
    """Decode the top-k answer spans per example from start/end pointer scores.

    Softmaxes the start/end score columns, pairs the top-k start candidates
    with the top-k end candidates (keeping only ``end >= start``), ranks the
    pairs by joint probability, and keeps the best ``top_k``. The single best
    span becomes the answer text.
    """
    ystart, yend = out['scores'].split(1, dim=-1)
    pstart = F.softmax(ystart.squeeze(-1), dim=1)
    pend = F.softmax(yend.squeeze(-1), dim=1)

    preds = []
    for ps_i, pe_i, ex in zip(pstart, pend, batch):
        starts = self.get_top_k(ps_i, top_k)
        ends = self.get_top_k(pe_i, top_k)
        # score every valid (start <= end) pairing of the candidates
        candidates = [
            (s, e, p_s * p_e)
            for s, p_s in starts
            for e, p_e in ends
            if e >= s
        ]
        candidates.sort(key=lambda tup: tup[-1], reverse=True)
        answers = [
            (detokenize(ex['feat']['inp'][s:e + 1]), s, e, p)
            for s, e, p in candidates[:top_k]
        ]
        best_ans, best_s, best_e, _ = answers[0]
        preds.append({
            'utterance_id': ex['utterance_id'],
            'top_k': answers,
            'answer': best_ans,
            'spans': [(best_s, best_e)],
            'retrieve_span': 0,
        })
    return preds
def extract_preds(self, out, batch, top_k=20):
    """Decode class, retrieval, and edited-question predictions per example.

    Takes the argmax class and retrieval index, greedily decodes each edited
    question (truncated at the first 'eos'), and — when the predicted class is
    'more' — replaces the answer with the edit for the retrieved span and
    records that span's text under ``'retrieval'``.
    """
    clf_ids = out['clf_scores'].max(1)[1].tolist()
    retrieve_ids = out['retrieve_scores'].max(1)[1].tolist()

    preds = []
    for ex, clf_i, retrieve_i, spans_i, edit_scores_i in zip(
            batch, clf_ids, retrieve_ids, out['spans'], out['edit_scores']):
        answer = CLASSES[clf_i]

        # greedy-decode each edit, truncating at the first 'eos'
        edits = []
        for ids in edit_scores_i.max(2)[1].tolist():
            words = self.vocab.index2word(ids)
            if 'eos' in words:
                words = words[:words.index('eos')]
            edits.append(' '.join(words))

        retrieval = None
        if answer == 'more':
            s, e = spans_i[retrieve_i]
            retrieval = detokenize(ex['feat']['inp'][s:e + 1])
            answer = edits[retrieve_i]

        preds.append({
            'utterance_id': ex['utterance_id'],
            'retrieval': retrieval,
            'answer': answer,
            'spans': spans_i,
        })
    return preds
def extract_bullets(self, spans, ex):
    """Snap predicted spans to bullet items in the snippet, dropping overlaps.

    Locates the snippet region via the pointer mask, turns each '*'-marked
    bullet into a (start, end) span, keeps predicted spans that contain no
    bullet marker or newline, then greedily retains the longest spans that are
    not already fully covered. Falls back to ``spans`` unchanged when the
    snippet has no bullets.
    """
    mask = ex['feat']['pointer_mask'].tolist()
    classes_start = mask.index(1)
    # snippet region starts 5 positions after the first mask hit
    # (presumably class placeholder tokens — TODO confirm input layout)
    snippet_start = classes_start + 5
    snippet_end = snippet_start + mask[snippet_start:].index(0)

    bullet_inds = [
        i for i in range(snippet_start, snippet_end)
        if ex['feat']['inp'][i]['sub'] == '*'
    ]
    if not bullet_inds:
        return spans

    # a bullet item runs from just after its '*' to just before the next one
    bullets = [
        (s + 1, e - 1)
        for s, e in zip(bullet_inds, bullet_inds[1:] + [snippet_end])
        if e - 1 >= s + 1
    ]

    # BUG FIX: the original detokenized the ENTIRE input instead of the span
    # [s:e+1], so the '*' check was loop-invariant and (with bullets present)
    # always true — non_bullet_spans was effectively always empty.
    non_bullet_spans = []
    for s, e in spans:
        gloss = detokenize(ex['feat']['inp'][s:e + 1])
        if '*' not in gloss and '\n' not in gloss:
            non_bullet_spans.append((s, e))

    # greedily keep the longest spans first, skipping fully-covered ones
    all_spans = sorted(bullets + non_bullet_spans,
                       key=lambda tup: tup[1] - tup[0], reverse=True)
    covered = [False] * len(ex['feat']['inp'])
    keep = []
    for s, e in all_spans:
        if not all(covered[s:e + 1]):
            for i in range(s, e + 1):
                covered[i] = True
            keep.append((s, e))
    return keep
def extract_preds(self, out, batch, top_k=20):
    """Attach per-class, per-span, and entailment score breakdowns to preds.

    Builds on the parent's predictions, adding softmaxed class/span score
    dicts, raw start/end score lists, entailment score dicts keyed by span
    text, the non-pad subword list, and selected original example fields.
    """
    preds = super().extract_preds(out, batch, top_k=top_k)
    zipped = zip(batch, preds, out['span_scores'], out['clf_scores'],
                 out['retrieve_scores'], out['entail'])
    for ex, p, span_i, clf_i, retrieve_i, entail_i in zipped:
        p['clf_scores'] = dict(zip(CLASSES, F.softmax(clf_i, dim=0).tolist()))
        spans = [detokenize(ex['feat']['inp'][s:e + 1]) for s, e in p['spans']]
        p['span_scores'] = dict(zip(spans, F.softmax(retrieve_i, dim=0).tolist()))
        p['words'] = [w['sub'] for w in ex['feat']['inp'] if w['orig'] != 'pad']
        p['og'] = {
            k: v for k, v in ex.items()
            if k in {'snippet', 'scenario', 'question', 'history', 'answer'}
        }
        p['start_scores'] = span_i[:, 0].tolist()
        p['end_scores'] = span_i[:, 1].tolist()
        p['entail_hist_scores'] = dict(zip(spans, entail_i[:, 0].tolist()))
        p['entail_scen_scores'] = dict(zip(spans, entail_i[:, 1].tolist()))
    return preds
def extract_preds(self, out, batch, top_k=20):
    """Decode the predicted class per example, resolving 'more' to span text.

    Takes the argmax class and retrieval index; when the class is 'more', the
    answer becomes the detokenized text of the retrieved span.
    """
    clf_ids = out['clf_scores'].max(1)[1].tolist()
    retrieve_ids = out['retrieve_scores'].max(1)[1].tolist()

    preds = []
    for ex, clf_i, retrieve_i, span_i in zip(
            batch, clf_ids, retrieve_ids, out['spans']):
        answer = CLASSES[clf_i]
        if answer == 'more':
            # replace the 'more' class label with the retrieved span's text
            start, end = span_i[retrieve_i]
            answer = detokenize(ex['feat']['inp'][start:end + 1])
        preds.append({
            'utterance_id': ex['utterance_id'],
            'answer': answer,
            'spans': span_i,
            'retrieve_span': retrieve_i,
        })
    return preds
def extract_preds(self, out, batch):
    """Assemble answers by wrapping the example span with decoded edits.

    Greedily decodes the 'before' and 'after' sequences (truncated at the
    first 'eos'), detokenizes the example's own span as the middle, and joins
    the three parts with spaces.
    """
    def decode(ids):
        # ids -> words, truncated at the first 'eos' marker
        words = self.vocab.index2word(ids)
        if 'eos' in words:
            words = words[:words.index('eos')]
        return ' '.join(words)

    before_ids = out['before'].max(2)[1].tolist()
    after_ids = out['after'].max(2)[1].tolist()

    preds = []
    for before, after, ex in zip(before_ids, after_ids, batch):
        s, e = ex['span']
        middle = detokenize(ex['inp'][s:e + 1])
        preds.append({
            'utterance_id': ex['utterance_id'],
            'answer': '{} {} {}'.format(decode(before), middle, decode(after)),
        })
    return preds
def extract_spans(self, span_scores, batch):
    """Extract candidate spans whose start/end probabilities clear a threshold.

    For each example, every position whose start probability reaches
    ``min(max_start, args.thresh)`` opens a span; the first position at or
    after it whose end probability reaches the analogous end threshold closes
    it. Each span is recorded as
    ``(start, end, text, start_prob, end_prob)``.
    """
    pstart, pend = span_scores.split(1, dim=-1)
    spans = []
    for ps_i, pe_i, ex in zip(pstart.squeeze(-1), pend.squeeze(-1), batch):
        found = []
        # clamp the cutoff so at least the single best start always triggers
        start_cut = min(ps_i.max(), self.args.thresh)
        for si, start_hit in enumerate(ps_i.ge(start_cut).tolist()):
            if not start_hit:
                continue
            end_cut = min(pe_i[si:].max(), self.args.thresh)
            # take the first qualifying end at or after this start
            for offset, end_hit in enumerate(pe_i[si:].ge(end_cut).tolist()):
                if end_hit:
                    ei = si + offset
                    found.append((si, ei,
                                  detokenize(ex['feat']['inp'][si:ei + 1]),
                                  ps_i[si].item(), pe_i[ei].item()))
                    break
        spans.append(found)
    return spans
def compute_metrics(self, preds, batch):
    """Return the mean token F1 between predicted answers and gold questions."""
    scores = [
        compute_f1(p['answer'], detokenize(ex['question']))
        for p, ex in zip(preds, batch)
    ]
    return {'f1': sum(scores) / len(scores)}