def number_match(fileparse):
    """Link number-like noun phrases to the longest digit run in the text.

    Every entry of ``fileparse.nps`` that has no ``'ref'`` yet and whose
    lower-cased text scores above 0.9 (via ``ratio``) against one of the
    number synonyms gets its ``'ref'`` set to the id of an entry holding the
    longest run of digits found in ``parse.text`` across
    ``fileparse.parses``.  The id is obtained from ``_get_cid``; when that
    returns a falsy value a new entry is stored under a fresh
    ``_mk_coref_id()``.

    Args:
        fileparse: object exposing ``nps`` (dict mapping an id to a dict with
            ``'text'`` and ``'ref'`` keys) and ``parses`` (iterable of objects
            with a ``text`` attribute).

    Returns:
        None.  ``fileparse.nps`` is modified in place.
    """
    synonyms = {
        u'number', u'integer', u'figure', u'digit', u'character', u'symbol',
        u'cardinal', u'ordinal', u'amount', u'quantity', u'total',
        u'aggregate', u'tally', u'quota', u'limit',
    }
    pattern = r'[\d\s]+'

    # The parses do not change while NPs are being linked, so the longest
    # digit run is computed once instead of once per noun phrase.
    longest = ''
    for parse in fileparse.parses:
        for match in findall(pattern, parse.text):
            # The pattern also matches pure-whitespace runs; stripping turns
            # those into '' (never the longest) and stops padding from
            # inflating the length of a real number.
            candidate = match.strip()
            if len(candidate) > len(longest):
                longest = candidate
    if not longest:
        # No digits anywhere: there is nothing a number NP could refer to.
        return

    # Snapshot the unresolved ids first; entries are added to nps below.
    unresolved = [key for key, np in fileparse.nps.items()
                  if not np.get('ref')]
    for cid in unresolved:
        text = fileparse.nps[cid]['text'].lower()
        if not any(ratio(text, syn) > .9 for syn in synonyms):
            continue
        aid = _get_cid(fileparse.nps, longest, cid)
        if not aid:
            aid = _mk_coref_id()
            fileparse.nps[aid] = {'text': longest, 'ref': None}
        fileparse.nps[cid]['ref'] = aid
def word_inclusion(fileparse):
    """Link noun phrases to a span of text containing most of their words.

    For every entry of ``fileparse.nps`` without a ``'ref'``, the entry's
    text is tokenised with ``word_tokenize`` and lower-cased.  Each parse
    text (lower-cased) is then checked: when at least half of the words
    (rounded down) occur followed by a space, the slice of the parse text
    spanning the matched words is taken as the anaphor.  The first parse
    that yields a non-empty anaphor wins: the entry's ``'ref'`` is set to the
    id returned by ``_get_cid`` for that anaphor, or to a newly created entry
    under ``_mk_coref_id()`` when ``_get_cid`` returns a falsy value.

    Args:
        fileparse: object exposing ``nps`` (dict mapping an id to a dict with
            ``'text'`` and ``'ref'`` keys) and ``parses`` (iterable of objects
            with a ``text`` attribute).

    Returns:
        None.  ``fileparse.nps`` is modified in place.
    """
    # Snapshot the unresolved ids first; entries are added to nps below.
    unresolved = [key for key, np in fileparse.nps.items()
                  if not np.get('ref')]
    for cid in unresolved:
        # The NP text is the same for every parse, so tokenise it once.
        words = [w.lower() for w in word_tokenize(fileparse.nps[cid]['text'])]
        # Floor division keeps integer semantics on Python 2 and Python 3.
        majority = len(words) // 2

        for parse in fileparse.parses:
            text = parse.text.lower()
            # For each word: position when followed by a space, position of
            # the bare word, and the word length.
            hits = [(text.find(word + ' '), text.find(word), len(word))
                    for word in words]
            num_found = sum(1 for spaced, _, _ in hits if spaced != -1)

            anaphor = ''
            if num_found >= majority:
                first_index = len(text)
                last_index = 0
                # NOTE(review): the start is gated on the "word + space"
                # position but assigned the bare-word position, and the end
                # uses the bare-word position only.  Kept as is; confirm this
                # mix is intended.
                for spaced, bare, length in hits:
                    if spaced != -1 and spaced < first_index:
                        first_index = bare
                    if bare != -1 and bare + length > last_index:
                        last_index = bare + length
                if first_index < last_index:
                    anaphor = text[first_index:last_index]

            if anaphor:
                aid = _get_cid(fileparse.nps, anaphor, cid)
                if not aid:
                    aid = _mk_coref_id()
                    fileparse.nps[aid] = {'text': anaphor, 'ref': None}
                fileparse.nps[cid]['ref'] = aid
                # Resolved; do not let later parses overwrite the link.
                break
def pronouns(fileparse):
    """Resolve unreferenced noun phrases with the ``hobbs`` helper.

    For each entry of ``fileparse.nps`` that has no ``'ref'``, ``hobbs`` is
    asked for a proposal.  A truthy proposal is joined with single spaces and
    handed to ``_get_cid``; when that returns a falsy value a new entry with
    the joined text is stored under a fresh ``_mk_coref_id()``.  The entry's
    ``'ref'`` is then set to the resulting id.

    Args:
        fileparse: object exposing ``nps`` (dict mapping an id to a dict with
            ``'text'`` and ``'ref'`` keys); also passed through to ``hobbs``.

    Returns:
        None.  ``fileparse.nps`` is modified in place.
    """
    # Work on a snapshot so that entries added below are not visited.
    pending = {key: np for key, np in fileparse.nps.items()
               if not np.get('ref')}
    for cid in pending:
        proposal = hobbs(fileparse, cid)
        if not proposal:
            continue

        antecedent = ' '.join(proposal)
        aid = _get_cid(fileparse.nps, antecedent, cid)
        if not aid:
            aid = _mk_coref_id()
            fileparse.nps[aid] = {'text': antecedent, 'ref': None}
        fileparse.nps[cid]['ref'] = aid
def exact_match(fileparse):
    """Link noun phrases whose exact text occurs more than once.

    For every entry of ``fileparse.nps`` without a ``'ref'``, the occurrences
    of the entry's text are counted over ``parse.text`` of all
    ``fileparse.parses``.  When the total exceeds one, the entry's ``'ref'``
    is set to the id returned by ``_get_cid`` for that text, or to a newly
    created entry under ``_mk_coref_id()`` when ``_get_cid`` returns a falsy
    value.  Entries with empty or missing text are skipped.

    Args:
        fileparse: object exposing ``nps`` (dict mapping an id to a dict with
            ``'text'`` and ``'ref'`` keys) and ``parses`` (iterable of objects
            with a ``text`` attribute).

    Returns:
        None.  ``fileparse.nps`` is modified in place.
    """
    # Snapshot the unresolved ids first; entries are added to nps below.
    unresolved = [key for key, np in fileparse.nps.items()
                  if not np.get('ref')]
    for cid in unresolved:
        text = fileparse.nps[cid]['text']
        if not text:
            # str.count('') returns len(s) + 1, so an empty text would
            # "match" every parse; there is nothing meaningful to link.
            continue

        num_found = sum(parse.text.count(text) for parse in fileparse.parses)
        if num_found <= 1:
            continue

        aid = _get_cid(fileparse.nps, text, cid)
        if not aid:
            aid = _mk_coref_id()
            fileparse.nps[aid] = {'text': text, 'ref': None}
        fileparse.nps[cid]['ref'] = aid
def _shrinking_windows(text):
    """Yield text, then substrings made by dropping its first/last char in turn.

    Nothing is yielded when ``text`` has two characters or fewer.
    """
    if len(text) <= 2:
        return
    yield text
    while len(text) > 2:
        text = text[1:]
        yield text
        text = text[:-1]
        yield text


def levenshtein_inclusion(fileparse):
    """Link noun phrases to approximately matching text.

    For every entry of ``fileparse.nps`` without a ``'ref'``:

    1. The lower-cased text is compared (``ratio``) with the lower-cased text
       of every *other* entry.  The entry with the highest score above 0.6
       becomes the ``'ref'``.
    2. Only when step 1 finds nothing, each parse text is lower-cased and
       shrunk by alternately dropping its first and last character; the
       candidate with the smallest ``distance`` to the NP text is the
       proposal.  The first parse whose proposal scores above 0.3 with
       ``ratio`` is linked: ``'ref'`` is set to the id returned by
       ``_get_cid`` for the proposal, or to a newly created entry under
       ``_mk_coref_id()`` when ``_get_cid`` returns a falsy value.

    Args:
        fileparse: object exposing ``nps`` (dict mapping an id to a dict with
            ``'text'`` and ``'ref'`` keys) and ``parses`` (iterable of objects
            with a ``text`` attribute).

    Returns:
        None.  ``fileparse.nps`` is modified in place.
    """
    # Snapshot the unresolved ids first; entries are added to nps below.
    unresolved = [key for key, np in fileparse.nps.items()
                  if not np.get('ref')]
    for cid in unresolved:
        referent = fileparse.nps[cid]['text'].lower()

        # Search tagged corefs.  The threshold lives outside the loop so the
        # best candidate wins, not merely the last one above 0.6.
        best_ratio = 0.6
        best_aid = None
        for aid in fileparse.nps:
            if aid == cid:
                # An NP always matches itself perfectly; never self-link.
                continue
            score = ratio(referent, fileparse.nps[aid]['text'].lower())
            if score > best_ratio:
                best_ratio = score
                best_aid = aid
        if best_aid is not None:
            fileparse.nps[cid]['ref'] = best_aid
            # Resolved NPs are left alone, as everywhere else in this file.
            continue

        # Fall back to searching the raw parse text.
        for parse in fileparse.parses:
            best_dist = None
            proposal = ''
            for window in _shrinking_windows(parse.text.lower()):
                dist = distance(window, referent)
                if best_dist is None or dist < best_dist:
                    best_dist = dist
                    proposal = window

            # Judge the closest candidate, not the leftover stub of the text.
            if proposal and ratio(proposal, referent) > 0.3:
                aid = _get_cid(fileparse.nps, proposal, cid)
                if not aid:
                    aid = _mk_coref_id()
                    fileparse.nps[aid] = {'text': proposal, 'ref': None}
                fileparse.nps[cid]['ref'] = aid
                break