# Project-local modules used throughout this file; the exact import paths may
# differ in the original package layout.
import coref
import dictionaries
import my_constant


def is_generic_you(attr, ment, sents):
    """Return True for a non-referential 'you', as in 'you know' / 'thank you'."""
    if not (ment[2] - ment[1] == 1 and attr['head_word'] == 'you'):
        return False
    # 'you' immediately followed by 'know'.
    if ment[2] + 1 <= len(sents[ment[0]]):
        next_word = coref.mention_text((ment[0], ment[2], ment[2] + 1), sents)
        if next_word.lower() == 'know':
            return True
    # 'you' immediately preceded by 'thank'.
    if ment[1] - 1 >= 0:
        prev_word = coref.mention_text((ment[0], ment[1] - 1, ment[1]), sents)
        if prev_word.lower() == 'thank':
            return True
    return False
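# Illustrative sketch (not part of the original module): the checks above assume
# a mention is a (sentence_index, token_start, token_end) tuple and that
# coref.mention_text joins the covered tokens. The stand-in below mirrors that
# assumed behaviour so the 'you know' / 'thank you' heuristic can be exercised
# without the project's coref module; _mention_text_standin and _example_sents
# are hypothetical names introduced only for this example.
def _mention_text_standin(ment, sents):
    sent_idx, start, end = ment
    return ' '.join(sents[sent_idx][start:end])


# 'you' spanning tokens 1..2 of "thank you very much" is preceded by 'thank',
# so is_generic_you would treat it as generic.
_example_sents = [['thank', 'you', 'very', 'much']]
assert _mention_text_standin((0, 1, 2), _example_sents) == 'you'
assert _mention_text_standin((0, 0, 1), _example_sents) == 'thank'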
def has_following_apostrophe(ment, sents):
    if ment[2] + 1 <= len(sents[ment[0]]):
        next_word = coref.mention_text((ment[0], ment[2], ment[2] + 1), sents)
        if next_word == "'s":
            return True
    return False
def remove_conll_spurious_mentions(ments, sents, trees, heads, sner, params):
    """Remove predicted mentions considered spurious for CoNLL-style evaluation."""
    ments_to_remove = []
    for ment in ments.keys():
        # Mentions flagged as generic by is_generic.
        if is_generic(ment, sents, trees, heads):
            ments_to_remove.append(ment)
        surface = coref.mention_text(ment, sents).lower()
        # GPE acronyms recognized by NER, stop words, stop prefixes/suffixes.
        if (ment in sner[ment[0]] and sner[ment[0]][ment] == 'GPE'
                and surface in dictionaries.gpe_acronyms):
            ments_to_remove.append(ment)
        if surface in dictionaries.stop_words:
            ments_to_remove.append(ment)
        if start_with_stop_prefixes(surface):
            ments_to_remove.append(ment)
        if end_with_stop_suffixes(surface):
            ments_to_remove.append(ment)
        # Mentions whose head is tagged PERCENT or MONEY.
        head_span, head_word, head_pos = \
            coref.mention_head(ment, sents, trees, heads)
        tmp = (ment[0], head_span[0], head_span[1])
        if (tmp in sner[ment[0]]
                and sner[ment[0]][tmp] in {'PERCENT', 'MONEY'}):
            ments_to_remove.append(ment)
    for r in ments_to_remove:
        if r in ments:
            ments.pop(r)
def is_pleonastic(attr, ment, sents, gold_ments=None):
    """Flag non-referential pronouns: pleonastic 'it' and generic 'you'."""
    if attr['type'] != my_constant.MAP_MTYPES['pronoun']:
        return False
    if attr['surface'] == 'it':
        # 'it' followed by a word listed in dictionaries.pleonastic_words.
        if ment[2] + 1 <= len(sents[ment[0]]):
            next_word = coref.mention_text((ment[0], ment[2], ment[2] + 1), sents)
            if next_word.lower() in dictionaries.pleonastic_words:
                return True
    if attr['surface'] == 'you':
        # Generic 'you' in 'you know' / 'thank you'.
        if ment[2] + 1 <= len(sents[ment[0]]):
            next_word = coref.mention_text((ment[0], ment[2], ment[2] + 1), sents)
            if next_word.lower() == 'know':
                return True
        if ment[1] - 1 >= 0:
            prev_word = coref.mention_text((ment[0], ment[1] - 1, ment[1]), sents)
            if prev_word.lower() == 'thank':
                return True
    return False
def print_missing_gold_mentions(data):
    """Log gold mentions that the mention detector failed to predict."""
    out = open('missing_gold_mentions.log', 'w')
    order = []
    for doc in data:
        for part in data[doc]:
            order.append((doc, part))
    order.sort()
    for doc, part in order:
        sents = data[doc][part]['text']
        trees = data[doc][part]['parses']
        names = data[doc][part]['ner']
        gold_mentions = data[doc][part]['mentions']
        doc_mentions = data[doc][part]['doc_mentions']
        print >> out, "# %s %s\n" % (doc, part)
        num_missing = 0
        num_not_con = 0
        pred = {}
        for ments in doc_mentions:
            for m in ments:
                pred[m] = True
        gold = [g for g in gold_mentions]
        gold.sort()
        for g in gold:
            if g not in pred:
                num_missing += 1
                # Record whether the missed mention is a parse constituent
                # and whether it carries a named-entity annotation.
                node = trees[g[0]].get_nodes('lowest', g[1], g[2])
                con = 1
                if node is None:
                    num_not_con += 1
                    con = 0
                ner = 1
                if g not in names:
                    ner = 0
                print >> out, "%s\t%d\t%d\t%s" % (
                    g, con, ner, coref.mention_text(g, sents))
        if num_missing > 0:
            # Trailing comma keeps the count and the parenthetical on one line.
            print >> out, "\n#missing mentions = %d" % num_missing,
            print >> out, "(%d are not constituents)\n" % num_not_con
    out.close()
    return True
def init(doc_ments, sents, trees, heads, sner, speakers):
    """Build the attribute dictionary for every mention in a document."""
    doc_attrs = {}
    for sent_ments in doc_ments:
        for ment in sent_ments:
            attr = {}
            attr['type'] = my_constant.MAP_MTYPES[coref.mention_type(
                ment, sents, trees, heads)]
            attr['surface'] = coref.mention_text(ment, sents).lower()
            set_head(attr, ment, sents, trees, heads)
            set_first_word(attr, ment, sents, trees, heads)
            set_ner(attr, ment, sner)
            attr['relaxed_surface'] = remove_phrase_after_head(
                attr, ment, sents, trees, heads)
            attr['word_list'] = extract_word_list(attr)
            attr['modifiers'] = extract_modifiers(attr, ment, sents, trees,
                                                  heads)
            extract_properties(attr, ment, sents)
            set_speaker(attr, ment, speakers)
            attr['pleonastic'] = is_pleonastic(attr, ment, sents)
            doc_attrs[ment] = attr
    return doc_attrs
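# Hypothetical usage sketch: one way a caller might consume init()'s output.
# The surrounding pipeline code is assumed and not part of this module.
#
#   doc_attrs = init(doc_ments, sents, trees, heads, sner, speakers)
#   for ment, attr in doc_attrs.items():
#       if attr['pleonastic']:
#           continue  # skip non-referential pronouns ('it seems', 'you know')
#       ...  # use keys such as attr['surface'] and attr['head_word'] downstream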