예제 #1
0
def is_generic_you(attr, ment, sents):
    """Tell whether a mention is a generic "you" ("you know" / "thank you").

    ``ment`` is indexed as ``(sentence index, start, end)`` with an
    exclusive end.  Only one-token mentions whose ``attr['head_word']`` is
    ``'you'`` qualify; such a mention is generic when the next token is
    "know" or the previous token is "thank" (compared case-insensitively).
    """
    sent_idx, start, end = ment[0], ment[1], ment[2]

    single_you = end - start == 1 and attr['head_word'] == 'you'
    if not single_you:
        return False

    # "you know": only look right when a token exists after the mention.
    if end < len(sents[sent_idx]):
        following = coref.mention_text((sent_idx, end, end + 1), sents)
        if following.lower() == 'know':
            return True

    # "thank you": only look left when the mention is not sentence-initial.
    if start > 0:
        preceding = coref.mention_text((sent_idx, start - 1, start), sents)
        return preceding.lower() == 'thank'

    return False
예제 #2
0
def is_generic_you(attr, ment, sents):
    """Detect the generic uses of "you": "you know" and "thank you".

    ``ment`` is read as ``(sentence index, start, end)`` with an exclusive
    end.  The mention must be exactly one token long and have
    ``attr["head_word"] == "you"``; it is then generic if the token right
    after it is "know" or the token right before it is "thank"
    (case-insensitive).
    """
    sent_idx = ment[0]
    first, stop = ment[1], ment[2]

    if stop - first != 1 or attr["head_word"] != "you":
        return False

    # Look one token to the right, if the sentence has one.
    has_next = stop < len(sents[sent_idx])
    if has_next:
        right = coref.mention_text((sent_idx, stop, stop + 1), sents)
        if right.lower() == "know":
            return True

    # Look one token to the left, if the mention does not open the sentence.
    has_prev = first >= 1
    if has_prev:
        left = coref.mention_text((sent_idx, first - 1, first), sents)
        if left.lower() == "thank":
            return True

    return False
예제 #3
0
def has_following_apostrophe(ment, sents):
    """Return True when the token right after the mention is exactly "'s".

    ``ment`` is indexed as ``(sentence index, start, end)`` with an
    exclusive end; a mention that closes its sentence has no following
    token and yields False.
    """
    sent_idx, end = ment[0], ment[2]

    # Nothing follows a mention that already reaches the sentence end.
    if end >= len(sents[sent_idx]):
        return False

    return coref.mention_text((sent_idx, end, end + 1), sents) == "'s"
예제 #4
0
def remove_conll_spurious_mentions(ments, sents, trees, heads, sner, params):
    """Delete spurious mentions from ``ments`` in place.

    A mention (a key of ``ments``) is removed when any of these holds:

    * ``is_generic`` reports it as generic;
    * it is tagged 'GPE' in ``sner`` and its lower-cased text is listed in
      ``dictionaries.gpe_acronyms``;
    * its lower-cased text is in ``dictionaries.stop_words``;
    * its lower-cased text matches ``start_with_stop_prefixes`` or
      ``end_with_stop_suffixes``;
    * its head span is tagged 'PERCENT' or 'MONEY' in ``sner``.

    ``params`` is accepted but not used.  Nothing is returned.
    """
    quantity_tags = {'PERCENT', 'MONEY'}
    doomed = set()

    for ment in ments:
        # Every check is evaluated for every mention, in a fixed order.
        generic = is_generic(ment, sents, trees, heads)

        surface = coref.mention_text(ment, sents).lower()
        sent_ner = sner[ment[0]]

        gpe_acronym = (ment in sent_ner and sent_ner[ment] == 'GPE'
                       and surface in dictionaries.gpe_acronyms)
        stop_word = surface in dictionaries.stop_words
        stop_prefix = start_with_stop_prefixes(surface)
        stop_suffix = end_with_stop_suffixes(surface)

        head_span, _, _ = coref.mention_head(ment, sents, trees, heads)
        head_key = (ment[0], head_span[0], head_span[1])
        quantity_head = (head_key in sent_ner
                         and sent_ner[head_key] in quantity_tags)

        if (generic or gpe_acronym or stop_word or stop_prefix
                or stop_suffix or quantity_head):
            doomed.add(ment)

    # Removal is deferred so the mapping is not mutated while iterating it.
    for ment in doomed:
        ments.pop(ment, None)
예제 #5
0
def is_pleonastic(attr, ment, sents, gold_ments=None):
    """Tell whether a pronoun mention is pleonastic (non-referential).

    Only mentions whose ``attr["type"]`` equals the pronoun type are
    considered.  ``ment`` is indexed as ``(sentence index, start, end)``
    with an exclusive end.

    * "it" is pleonastic when the next token (lower-cased) is listed in
      ``dictionaries.pleonastic_words``.
    * "you" is pleonastic when followed by "know" or preceded by "thank".

    ``gold_ments`` is accepted but not used.
    """
    if attr["type"] != my_constant.MAP_MTYPES["pronoun"]:
        return False

    surface = attr["surface"]
    if surface not in ("it", "you"):
        return False

    sent_idx, start, end = ment[0], ment[1], ment[2]

    # Lower-cased token after the mention, or None at the sentence end.
    following = None
    if end < len(sents[sent_idx]):
        following = coref.mention_text((sent_idx, end, end + 1), sents).lower()

    if surface == "it":
        return following is not None and following in dictionaries.pleonastic_words

    # surface == "you"
    if following == "know":
        return True
    if start >= 1:
        preceding = coref.mention_text((sent_idx, start - 1, start), sents)
        return preceding.lower() == "thank"

    return False
예제 #6
0
def is_pleonastic(attr, ment, sents, gold_ments=None):
    """Decide whether a pronoun mention is a pleonastic "it" or "you".

    Non-pronoun mentions (by ``attr['type']``) are never pleonastic.
    ``ment`` is read as ``(sentence index, start, end)`` with an exclusive
    end.  "it" counts when the next token, lower-cased, is in
    ``dictionaries.pleonastic_words``; "you" counts in "you know" and
    "thank you".  ``gold_ments`` is accepted but not used.
    """
    if attr['type'] != my_constant.MAP_MTYPES['pronoun']:
        return False

    word = attr['surface']
    sent_idx = ment[0]

    if word == 'it':
        stop = ment[2]
        if stop >= len(sents[sent_idx]):
            # Sentence-final "it": there is no following token to inspect.
            return False
        right = coref.mention_text((sent_idx, stop, stop + 1), sents)
        return right.lower() in dictionaries.pleonastic_words

    if word == 'you':
        first, stop = ment[1], ment[2]
        if stop < len(sents[sent_idx]):
            right = coref.mention_text((sent_idx, stop, stop + 1), sents)
            if right.lower() == 'know':
                return True
        if first > 0:
            left = coref.mention_text((sent_idx, first - 1, first), sents)
            if left.lower() == 'thank':
                return True

    return False
예제 #7
0
def print_missing_gold_mentions(data):
    """Write gold mentions missing from the predictions to a log file.

    The report goes to ``missing_gold_mentions.log`` in the current working
    directory (overwritten on every call).  ``data`` is indexed as
    ``data[doc][part]``, and each entry supplies the keys ``'text'``,
    ``'parses'``, ``'ner'``, ``'mentions'`` (gold) and ``'doc_mentions'``
    (an iterable of groups of predicted mentions).

    For every ``(doc, part)`` in sorted order a ``# doc part`` header is
    written, followed by one tab-separated line per gold mention that was
    not predicted: the mention, a flag that is 0 when the parse tree has no
    node for its span, a flag that is 0 when the mention is not in the
    ``'ner'`` entry, and the mention text.  A summary line is added when at
    least one mention is missing.

    Returns:
        Always ``True``.
    """
    # The context manager closes (and flushes) the log even when a lookup
    # below raises; ``out.write`` replaces the Python 2-only ``print >>``
    # statement and produces the same bytes on Python 2 and 3.
    with open('missing_gold_mentions.log', 'w') as out:
        order = sorted((doc, part) for doc in data for part in data[doc])
        for doc, part in order:
            section = data[doc][part]
            sents = section['text']
            trees = section['parses']
            names = section['ner']
            gold_mentions = section['mentions']
            doc_mentions = section['doc_mentions']
            out.write("# %s %s\n\n" % (doc, part))

            pred = set(m for ments in doc_mentions for m in ments)

            num_missing = 0
            num_not_con = 0
            for g in sorted(gold_mentions):
                if g in pred:
                    continue
                num_missing += 1
                node = trees[g[0]].get_nodes('lowest', g[1], g[2])
                con = 1
                if node is None:
                    # No tree node for this span: counted as a
                    # non-constituent in the summary.
                    num_not_con += 1
                    con = 0
                ner = 1 if g in names else 0
                out.write("%s\t%d\t%d\t%s\n" % (g, con, ner,
                                                coref.mention_text(g, sents)))

            if num_missing > 0:
                # The original used two print statements joined by a
                # trailing comma, i.e. a single space between the parts.
                out.write("\n#missing mentions = %d (%d are not constituents)\n\n"
                          % (num_missing, num_not_con))

    return True
예제 #8
0
def print_missing_gold_mentions(data):
    """Report gold mentions that the system failed to predict.

    Output is written to ``missing_gold_mentions.log`` in the current
    working directory, replacing any previous content.  ``data`` is indexed
    as ``data[doc][part]``; each entry must provide the keys ``'text'``,
    ``'parses'``, ``'ner'``, ``'mentions'`` (gold) and ``'doc_mentions'``
    (an iterable of groups of predicted mentions).

    Parts are processed in sorted ``(doc, part)`` order.  Each part gets a
    ``# doc part`` header, then one tab-separated line per unpredicted gold
    mention (mention, constituent flag, NER flag, mention text), then a
    summary line if anything was missing.  The constituent flag is 0 when
    the parse tree returns no node for the span; the NER flag is 0 when the
    mention is not in the ``'ner'`` entry.

    Returns:
        Always ``True``.
    """
    # ``with`` guarantees the handle is closed and flushed even on error;
    # explicit ``write`` calls replace the Python 2-only ``print >> out``
    # statement while emitting identical text.
    with open('missing_gold_mentions.log', 'w') as out:
        for doc, part in sorted((d, p) for d in data for p in data[d]):
            entry = data[doc][part]
            sents = entry['text']
            trees = entry['parses']
            names = entry['ner']
            gold_mentions = entry['mentions']
            doc_mentions = entry['doc_mentions']
            out.write("# %s %s\n\n" % (doc, part))

            predicted = set()
            for ments in doc_mentions:
                predicted.update(ments)

            num_missing = 0
            num_not_con = 0
            for g in sorted(gold_mentions):
                if g in predicted:
                    continue
                num_missing += 1
                con = 1
                if trees[g[0]].get_nodes('lowest', g[1], g[2]) is None:
                    # Span has no tree node: tallied as a non-constituent.
                    num_not_con += 1
                    con = 0
                ner = 1 if g in names else 0
                out.write("%s\t%d\t%d\t%s\n" % (g, con, ner,
                                                coref.mention_text(g, sents)))

            if num_missing > 0:
                # Python 2 joined the two original print statements with a
                # single space (trailing-comma soft space); keep that.
                out.write("\n#missing mentions = %d (%d are not constituents)\n\n"
                          % (num_missing, num_not_con))

    return True
예제 #9
0
def init(doc_ments, sents, trees, heads, sner, speakers):
    """Build the attribute dictionary of every mention in a document.

    ``doc_ments`` is an iterable of per-sentence mention collections.  The
    result maps each mention to a dict holding at least the keys "type",
    "surface", "relaxed_surface", "word_list", "modifiers" and
    "pleonastic"; the ``set_*`` / ``extract_properties`` helpers receive
    the same dict and may add further keys.
    """
    doc_attrs = {}
    for ment in (m for sent_ments in doc_ments for m in sent_ments):
        mtype = coref.mention_type(ment, sents, trees, heads)
        attr = {
            "type": my_constant.MAP_MTYPES[mtype],
            "surface": coref.mention_text(ment, sents).lower(),
        }

        # The helpers below all receive the partially filled ``attr``, so
        # the original call order is kept (later ones may read earlier
        # fields).
        set_head(attr, ment, sents, trees, heads)
        set_first_word(attr, ment, sents, trees, heads)
        set_ner(attr, ment, sner)

        relaxed = remove_phrase_after_head(attr, ment, sents, trees, heads)
        attr["relaxed_surface"] = relaxed
        attr["word_list"] = extract_word_list(attr)
        attr["modifiers"] = extract_modifiers(attr, ment, sents, trees, heads)

        extract_properties(attr, ment, sents)
        set_speaker(attr, ment, speakers)
        attr["pleonastic"] = is_pleonastic(attr, ment, sents)

        doc_attrs[ment] = attr

    return doc_attrs
예제 #10
0
def init(doc_ments, sents, trees, heads, sner, speakers):
    """Compute per-mention attributes for a whole document.

    Walks every mention of every sentence group in ``doc_ments`` and returns
    a dict mapping the mention to its attribute dict.  That dict carries at
    least 'type', 'surface', 'relaxed_surface', 'word_list', 'modifiers'
    and 'pleonastic'; the ``set_*`` / ``extract_properties`` helpers are
    handed the same dict and may store more.
    """
    result = {}
    for sent_ments in doc_ments:
        for ment in sent_ments:
            info = {}
            kind = coref.mention_type(ment, sents, trees, heads)
            info['type'] = my_constant.MAP_MTYPES[kind]
            text = coref.mention_text(ment, sents)
            info['surface'] = text.lower()

            # Helpers are run in the original order; each is given the
            # partially built dict.
            set_head(info, ment, sents, trees, heads)
            set_first_word(info, ment, sents, trees, heads)
            set_ner(info, ment, sner)

            info['relaxed_surface'] = remove_phrase_after_head(
                info, ment, sents, trees, heads)
            info['word_list'] = extract_word_list(info)
            info['modifiers'] = extract_modifiers(
                info, ment, sents, trees, heads)

            extract_properties(info, ment, sents)
            set_speaker(info, ment, speakers)
            info['pleonastic'] = is_pleonastic(info, ment, sents)

            result[ment] = info

    return result