def colorize_alterations(tokens):
    """Rejoin ``tokens`` into text with altered tokens highlighted in cyan.

    A token counts as altered when it contains an ``'altered'`` key (the
    value is not inspected).  Such a token is replaced, for rendering only,
    by a minimal dict holding its colored ``'originalText'`` and its
    ``'before'`` entry; all other tokens are passed through untouched.

    Args:
      tokens: Iterable of token dicts providing ``'originalText'`` and
        ``'before'``.

    Returns:
      Whatever ``corenlp.rejoin`` produces for the rendered token list.
    """
    def _render(tok):
        # Unaltered tokens are handed to rejoin as-is.
        if 'altered' not in tok:
            return tok
        # NOTE(review): `colored` is presumably termcolor.colored -- confirm
        # against the file's imports.
        return {'originalText': colored(tok['originalText'], 'cyan'),
                'before': tok['before']}

    return corenlp.rejoin([_render(tok) for tok in tokens])
def _get_tokens_for_answers(self, answer_objs, corenlp_obj): """Get CoreNLP tokens corresponding to a SQuAD answer object.""" first_a_toks = None for i, a_obj in enumerate(answer_objs): a_toks = [] answer_start = a_obj['answer_start'] answer_end = answer_start + len(a_obj['text']) for s in corenlp_obj['sentences']: for t in s['tokens']: if t['characterOffsetBegin'] >= answer_end: continue if t['characterOffsetEnd'] <= answer_start: continue a_toks.append(t) if corenlp.rejoin(a_toks).strip() == a_obj['text']: # Make sure that the tokens reconstruct the answer return i, a_toks if i == 0: first_a_toks = a_toks # None of the extracted token lists reconstruct the answer # Default to the first return 0, first_a_toks
def ans_date(a, tokens, q, **kwargs):
    """Build a replacement answer for an answer made up only of DATE tokens.

    Numeric tokens are shifted (values above 50 are treated as years and
    moved back 25; smaller values are treated as days of the month and moved
    by 11 so they stay positive), month names are moved six months along
    ``MONTHS``, and every other token is kept verbatim.

    Args:
      a: Answer dict; only ``a['text']`` is read, to detect a no-op result.
      tokens: Token dicts providing ``'ner'``, ``'pos'``, ``'word'``,
        ``'before'`` and ``'originalText'``.
      q: Unused; accepted for a uniform signature.
      **kwargs: Unused.

    Returns:
      The new answer string, or ``None`` when some token is not tagged
      ``DATE`` or when the result equals the original answer text.
    """
    if not all(t['ner'] == 'DATE' for t in tokens):
        return None

    out_toks = []
    for t in tokens:
        if t['pos'] == 'CD' or t['word'].isdigit():
            try:
                value = int(t['word'])
            except (ValueError, TypeError):
                # CD-tagged words are not always plain integers; only the
                # conversion failure is handled so that interrupts and
                # unrelated errors still propagate.
                value = 10  # fallback
            if value > 50:
                new_val = str(value - 25)  # Year
            elif value > 15:
                new_val = str(value - 11)  # Day of month
            else:
                new_val = str(value + 11)  # Day of month
        elif t['word'].lower() in MONTHS:
            # NOTE(review): assumes MONTHS is an ordered sequence of the 12
            # lowercase month names -- confirm at its definition.
            m_ind = MONTHS.index(t['word'].lower())
            new_val = MONTHS[(m_ind + 6) % 12].title()
        else:
            # Give up
            new_val = t['originalText']
        out_toks.append({'before': t['before'], 'originalText': new_val})

    new_ans = corenlp.rejoin(out_toks).strip()
    if new_ans == a['text']:
        return None
    return new_ans
def alter_question(self, q, tokens, const_parse, strategy='separate'):
    """Alter the question to make it ask something else.

    Possible strategies:
      - separate: Do best alteration for each word separately.
      - best: Generate exactly one best alteration (may over-alter).
        NOTE(review): no branch below handles 'best'; with it the
        function currently returns an empty list -- confirm intent.
      - high-conf: Do all possible high-confidence alterations
      - high-conf-separate: Do best high-confidence alteration for each
        word separately.
      - all: Do all possible alterations (very conservative)

    Args:
      q: Original question text; in the combined strategies the altered
        question is only emitted when it differs from this.
      tokens: List of token dicts; ``'word'`` is read, and altered copies
        get ``'word'``, ``'lemma'``, ``'originalText'`` overwritten and
        ``'altered'`` set to True.
      const_parse: Parse handed to
        ``corenlp.ConstituencyParse.replace_words`` together with the new
        word list.
      strategy: One of the strategy names above.

    Returns:
      List of ``(new_question, new_tokens, new_const_parse, tag)`` tuples.
      For the ``*separate`` strategies there is one entry per (token, rule)
      pair that yields an acceptable word, tagged
      ``'<rule_name>-<token index>-<new word>'``.  For 'high-conf' / 'all'
      there is at most one entry, tagged with the strategy name.
    """
    # Set membership: this is probed for every candidate word below.
    used_words = {tok['word'].lower() for tok in tokens}
    combined = strategy in ('high-conf', 'all')
    separate = strategy.endswith('separate')
    if strategy.startswith('high-conf'):
        rules = HIGH_CONF_ALTER_RULES
    else:
        rules = ALL_ALTER_RULES

    new_qs = []
    toks_all = []
    for i, t in enumerate(tokens):
        if t['word'].lower() in DO_NOT_ALTER:
            if combined:
                toks_all.append(t)
            continue
        found = False
        for rule_name in rules:
            rule = rules[rule_name]
            new_words = rule(t, nearby_word_dict=self.nearby_word_dict,
                             postag_dict=self.postag_dict)
            if not new_words:
                continue
            for nw in new_words:
                if nw.lower() in used_words:
                    continue
                if nw.lower() in BAD_ALTERATIONS:
                    continue
                # Match capitalization
                if t['word'] == t['word'].upper():
                    nw = nw.upper()
                elif t['word'] == t['word'].title():
                    nw = nw.title()
                new_tok = dict(t)
                new_tok['word'] = new_tok['lemma'] = new_tok['originalText'] = nw
                new_tok['altered'] = True
                # NOTE: obviously this is approximate
                if separate:
                    new_tokens = tokens[:i] + [new_tok] + tokens[i + 1:]
                    new_q = corenlp.rejoin(new_tokens)
                    tag = '%s-%d-%s' % (rule_name, i, nw)
                    # The comprehension variable is deliberately not `t`:
                    # where comprehension variables leak (Python 2), reusing
                    # the name would rebind the token being altered and make
                    # the remaining rules run on the wrong token.
                    new_const_parse = corenlp.ConstituencyParse.replace_words(
                        const_parse, [tok['word'] for tok in new_tokens])
                    new_qs.append((new_q, new_tokens, new_const_parse, tag))
                    break
                elif combined:
                    toks_all.append(new_tok)
                    found = True
                    break
            if combined and found:
                # One replacement per token is enough in combined mode.
                break
        if combined and not found:
            toks_all.append(t)

    if combined:
        new_q = corenlp.rejoin(toks_all)
        new_const_parse = corenlp.ConstituencyParse.replace_words(
            const_parse, [tok['word'] for tok in toks_all])
        if new_q != q:
            new_qs.append((new_q, toks_all, new_const_parse, strategy))
    return new_qs
def ans_number(a, tokens, q, **kwargs):
    """Build a replacement answer by perturbing the numbers in ``tokens``.

    Each token is examined in turn.  The part of its word before the first
    dash is parsed as a number (commas removed); failing that, the token's
    ``'normalizedNER'`` (minus a leading ``%``, ``>`` or ``<``) is tried.
    Tokens tagged NUMBER, or PERCENT with POS ``CD``, are treated as
    numbers even when nothing parses.  Numeric tokens are rewritten:

      * magnitude words: 'thousand' -> 'million', the others -> 'thousand';
      * values strictly between 1000 and 2500: reduced by 75;
      * anything else: the first digit ``d`` becomes ``max((d + 5) % 10, 1)``.

    Any text after the first dash is re-attached unchanged.  Non-numeric
    tokens keep their ``'originalText'``.

    Args:
      a: Unused; accepted for a uniform signature.
      tokens: Token dicts providing ``'ner'``, ``'pos'``, ``'word'``,
        ``'before'``, ``'originalText'`` and optionally ``'normalizedNER'``.
      q: Unused.
      **kwargs: Unused.

    Returns:
      The rejoined, stripped replacement text, or ``None`` when no token
      was treated as a number.
    """
    out_toks = []
    seen_num = False
    for t in tokens:
        ner = t['ner']
        pos = t['pos']
        out_tok = {'before': t['before']}

        # Split on dashes: only the part before the first dash is altered.
        w, _, leftover = t['word'].partition('-')

        # Try to get a number out
        value = None
        if w != '%':  # Percent sign should just pass through
            try:
                value = float(w.replace(',', ''))
            except ValueError:
                try:
                    norm_ner = t['normalizedNER']
                    if norm_ner[0] in ('%', '>', '<'):
                        norm_ner = norm_ner[1:]
                    value = float(norm_ner)
                except (KeyError, IndexError, TypeError, ValueError):
                    # Missing, empty, non-string or non-numeric
                    # normalizedNER: leave value unset.
                    pass
        if not value and (
                ner == 'NUMBER' or (ner == 'PERCENT' and pos == 'CD')):
            # Force this to be a number anyways
            value = 10

        if not value:
            # Zero and unparsed tokens fall through unchanged.
            out_tok['originalText'] = t['originalText']
            out_toks.append(out_tok)
            continue

        # float() accepts 'inf' / 'nan'; swap in a finite stand-in so the
        # formatting below cannot fail.
        if math.isinf(value) or math.isnan(value):
            value = 9001
        seen_num = True
        if w in ('thousand', 'million', 'billion', 'trillion'):
            new_val = 'million' if w == 'thousand' else 'thousand'
        elif 1000 < value < 2500:
            shifted = value - 75
            # value is normally a float, so str() alone would turn "1990"
            # into "1915.0"; print whole numbers without the decimal part.
            if shifted == int(shifted):
                new_val = '%d' % shifted
            else:
                new_val = str(shifted)
        else:
            # Change leading digit
            if value == int(value):
                val_chars = list('%d' % value)
            else:
                val_chars = list('%g' % value)
            for idx, c in enumerate(val_chars):
                if '0' <= c <= '9':
                    # max(..., 1) keeps the new leading digit from being 0.
                    val_chars[idx] = str(max((int(c) + 5) % 10, 1))
                    break
            new_val = ''.join(val_chars)
        if leftover:
            new_val = '%s-%s' % (new_val, leftover)
        out_tok['originalText'] = new_val
        out_toks.append(out_tok)

    if seen_num:
        return corenlp.rejoin(out_toks).strip()
    return None