Exemplo n.º 1
0
def _count_typed_pairs(matrix, clusters, other_mentions, total_key,
                       correct_key, sents, trees, heads):
    """Count mention pairs of every cluster in `clusters`, keyed by type.

    For each pair of mentions that share a cluster, increments
    ``matrix[type_i, type_j, total_key]``; when both mentions are also keys
    of `other_mentions` and map to equal values there, increments
    ``matrix[type_i, type_j, correct_key]`` as well.  Index ``i`` is always
    the later and ``j`` the earlier mention in sorted order.
    """
    for cluster in clusters.values():
        mentions = sorted(cluster)
        if len(mentions) < 2:
            # No pairs to count, so no mention types are looked up either.
            continue
        # One type lookup per mention instead of one per pair.
        # NOTE(review): assumes coref.mention_type is deterministic for
        # identical arguments -- confirm.
        types = [coref.mention_type(mention, sents, trees, heads)
                 for mention in mentions]
        for i in range(1, len(mentions)):
            for j in range(i):
                matrix[types[i], types[j], total_key] += 1
                if (mentions[i] in other_mentions
                        and mentions[j] in other_mentions
                        and other_mentions[mentions[i]]
                        == other_mentions[mentions[j]]):
                    matrix[types[i], types[j], correct_key] += 1


def calc_matrix(matrix, gold_mentions, pred_mentions, gold_clusters,
                pred_clusters, sents, trees, heads):
    """Accumulate pairwise coreference counts per mention-type pair.

    Args:
        matrix: mapping updated in place with ``+= 1`` under the keys
            ``(type_i, type_j, counter_name)``.  It presumably supplies a
            numeric default for unseen keys (e.g. a defaultdict) -- verify
            against the caller.
        gold_mentions: mapping from mention to a value compared for
            equality (presumably its gold cluster id).
        pred_mentions: same as `gold_mentions`, for the predicted side.
        gold_clusters: mapping whose values are collections of mentions.
        pred_clusters: same as `gold_clusters`, for the predicted side.
        sents, trees, heads: passed through unchanged to
            ``coref.mention_type``.

    Counters written: 'total_gold_pairs' / 'correct_gold_pairs' for pairs
    taken from the gold clusters (checked against `pred_mentions`), and
    'total_pred_pairs' / 'correct_pred_pairs' for pairs taken from the
    predicted clusters (checked against `gold_mentions`).

    Returns:
        None; `matrix` is modified in place.
    """
    _count_typed_pairs(matrix, gold_clusters, pred_mentions,
                       'total_gold_pairs', 'correct_gold_pairs',
                       sents, trees, heads)
    _count_typed_pairs(matrix, pred_clusters, gold_mentions,
                       'total_pred_pairs', 'correct_pred_pairs',
                       sents, trees, heads)
Exemplo n.º 2
0
def _count_typed_pairs(matrix, clusters, other_mentions, total_key,
                       correct_key, sents, trees, heads):
    """Count mention pairs of every cluster in `clusters`, keyed by type.

    For each pair of mentions that share a cluster, increments
    ``matrix[type_i, type_j, total_key]``; when both mentions are also keys
    of `other_mentions` and map to equal values there, increments
    ``matrix[type_i, type_j, correct_key]`` as well.  Index ``i`` is always
    the later and ``j`` the earlier mention in sorted order.
    """
    for cluster in clusters.values():
        mentions = sorted(cluster)
        if len(mentions) < 2:
            # No pairs to count, so no mention types are looked up either.
            continue
        # One type lookup per mention instead of one per pair.
        # NOTE(review): assumes coref.mention_type is deterministic for
        # identical arguments -- confirm.
        types = [coref.mention_type(mention, sents, trees, heads)
                 for mention in mentions]
        for i in range(1, len(mentions)):
            for j in range(i):
                matrix[types[i], types[j], total_key] += 1
                if (mentions[i] in other_mentions and
                        mentions[j] in other_mentions and
                        other_mentions[mentions[i]] ==
                        other_mentions[mentions[j]]):
                    matrix[types[i], types[j], correct_key] += 1


def calc_matrix(matrix, gold_mentions, pred_mentions, gold_clusters,
                pred_clusters, sents, trees, heads):
    """Accumulate pairwise coreference counts per mention-type pair.

    Args:
        matrix: mapping updated in place with ``+= 1`` under the keys
            ``(type_i, type_j, counter_name)``.  It presumably supplies a
            numeric default for unseen keys (e.g. a defaultdict) -- verify
            against the caller.
        gold_mentions: mapping from mention to a value compared for
            equality (presumably its gold cluster id).
        pred_mentions: same as `gold_mentions`, for the predicted side.
        gold_clusters: mapping whose values are collections of mentions.
        pred_clusters: same as `gold_clusters`, for the predicted side.
        sents, trees, heads: passed through unchanged to
            ``coref.mention_type``.

    Counters written: 'total_gold_pairs' / 'correct_gold_pairs' for pairs
    taken from the gold clusters (checked against `pred_mentions`), and
    'total_pred_pairs' / 'correct_pred_pairs' for pairs taken from the
    predicted clusters (checked against `gold_mentions`).

    Returns:
        None; `matrix` is modified in place.
    """
    _count_typed_pairs(matrix, gold_clusters, pred_mentions,
                       'total_gold_pairs', 'correct_gold_pairs',
                       sents, trees, heads)
    _count_typed_pairs(matrix, pred_clusters, gold_mentions,
                       'total_pred_pairs', 'correct_pred_pairs',
                       sents, trees, heads)
Exemplo n.º 3
0
def evaluate(data, eval_by_types=False):
    """Score predicted mentions against gold mentions and log the result.

    Args:
        data: nested mapping ``data[doc][part]`` whose entries provide the
            keys 'text', 'parses', 'heads', 'mentions' (gold) and
            'pred_mentions'.
        eval_by_types: when true, also report per-type figures through
            ``print_rpf1_by_types``.

    A predicted mention counts as correct when it is contained in the gold
    mentions of the same part.  Overall recall, precision and F1 come from
    ``error_analyzer.calc_rpf1`` and are written to the log together with
    corpus statistics (files, docs, sentences, words).

    Returns:
        Always True.
    """
    counter_keys = ('num_rec', 'den_rec', 'num_pre', 'den_pre')
    data_stat = {'files': 0, 'docs': 0, 'sents': 0, 'words': 0}
    stat = dict.fromkeys(counter_keys, 0.0)
    # One independent counter dict per mention type.
    # NOTE(review): a type returned by coreference.mention_type other than
    # these three raises KeyError below -- confirm the set is exhaustive.
    type_stat = {mtype: dict.fromkeys(counter_keys, 0.0)
                 for mtype in ('name', 'nominal', 'pronoun')}

    for doc in data:
        data_stat['files'] += 1
        for part in data[doc]:
            part_data = data[doc][part]
            sents = part_data['text']
            trees = part_data['parses']
            heads = part_data['heads']
            gold = part_data['mentions']
            data_stat['words'] += sum(len(sent) for sent in sents)
            data_stat['docs'] += 1
            data_stat['sents'] += len(sents)

            for m in part_data['pred_mentions']:
                stat['den_pre'] += 1
                mtype = coreference.mention_type(m, sents, trees, heads)
                type_stat[mtype]['den_pre'] += 1
                if m in gold:
                    # A hit is a numerator for both precision and recall.
                    type_stat[mtype]['num_pre'] += 1
                    type_stat[mtype]['num_rec'] += 1
                    stat['num_pre'] += 1
                    stat['num_rec'] += 1

            for g in gold:
                mtype = coreference.mention_type(g, sents, trees, heads)
                type_stat[mtype]['den_rec'] += 1
                stat['den_rec'] += 1

    r, p, f1 = error_analyzer.calc_rpf1(stat)
    logger.info("Data statistics:\n    "
                "files = %d, docs = %d, sentences = %d, words = %d" % (
                    data_stat['files'], data_stat['docs'],
                    data_stat['sents'], data_stat['words']))

    if eval_by_types:
        print_rpf1_by_types(type_stat, stat)

    logger.info("Performance of mention detection:\n    "
                "P = %2.2lf%% (%d/%d), R = %2.2lf%% (%d/%d), F1 = %2.2lf%%"
                % ((p * 100), stat['num_pre'], stat['den_pre'],
                   (r * 100), stat['num_rec'], stat['den_rec'],
                   (f1 * 100)))

    return True
Exemplo n.º 4
0
def init(doc_ments, sents, trees, heads, sner, speakers):
    """Collect an attribute dictionary for every mention of a document.

    `doc_ments` is walked as a collection of mention groups (presumably
    one group per sentence).  For each mention a fresh dict is created,
    filled by the helper calls below in a fixed order, and stored under
    the mention itself.

    Returns:
        dict mapping each mention to its attribute dict.
    """
    attrs_by_mention = {}
    for mention in (m for group in doc_ments for m in group):
        raw_type = coref.mention_type(mention, sents, trees, heads)
        info = {
            "type": my_constant.MAP_MTYPES[raw_type],
            "surface": coref.mention_text(mention, sents).lower(),
        }
        # These helpers receive the dict itself; their return values are
        # not used here, so they presumably fill it in place.
        set_head(info, mention, sents, trees, heads)
        set_first_word(info, mention, sents, trees, heads)
        set_ner(info, mention, sner)
        info["relaxed_surface"] = remove_phrase_after_head(
            info, mention, sents, trees, heads)
        info["word_list"] = extract_word_list(info)
        info["modifiers"] = extract_modifiers(
            info, mention, sents, trees, heads)
        extract_properties(info, mention, sents)
        set_speaker(info, mention, speakers)
        info["pleonastic"] = is_pleonastic(info, mention, sents)
        attrs_by_mention[mention] = info

    return attrs_by_mention
Exemplo n.º 5
0
def init(doc_ments, sents, trees, heads, sner, speakers):
    """Build the mention -> attributes table for one document.

    Every mention found in the nested collection `doc_ments` gets its own
    attribute dict, assembled by `describe` below, and is used as the key
    of the returned dict.
    """

    def describe(mention):
        # Assemble the attributes of one mention.  The call order is kept
        # because later helpers are handed the partially filled dict.
        mention_type = coref.mention_type(mention, sents, trees, heads)
        record = {'type': my_constant.MAP_MTYPES[mention_type]}
        record['surface'] = coref.mention_text(mention, sents).lower()
        set_head(record, mention, sents, trees, heads)
        set_first_word(record, mention, sents, trees, heads)
        set_ner(record, mention, sner)
        relaxed = remove_phrase_after_head(record, mention, sents, trees,
                                           heads)
        record['relaxed_surface'] = relaxed
        record['word_list'] = extract_word_list(record)
        modifiers = extract_modifiers(record, mention, sents, trees, heads)
        record['modifiers'] = modifiers
        extract_properties(record, mention, sents)
        set_speaker(record, mention, speakers)
        record['pleonastic'] = is_pleonastic(record, mention, sents)
        return record

    table = {}
    for mention_group in doc_ments:
        for mention in mention_group:
            table[mention] = describe(mention)
    return table