Exemplo n.º 1
0
def __nil_clustering(nom_dict_file, edl_file, dst_file):
    """Greedily cluster NIL-linked mentions whose names are not nominals.

    Each existing NIL kbid group is merged into the first previously kept
    group that __should_merge accepts; merged mentions take over the kept
    group's kbid.  All mentions (NIL and non-NIL) are written to dst_file.
    """
    nom_names = load_nom_dict(nom_dict_file)
    all_mentions = Mention.load_edl_file(edl_file)
    # Only NIL-linked mentions with non-nominal names take part in clustering.
    nil_mentions = [m for m in all_mentions if m.kbid.startswith('NIL') and m.name.lower() not in nom_names]
    kbid_mentions = __group_mentions_by_kbid(nil_mentions)

    new_kbids, new_mentions_kbids = list(), list()
    for kbid, mentions in kbid_mentions.iteritems():
        merged = False
        for nkbid, nmentions in izip(new_kbids, new_mentions_kbids):
            if __should_merge(mentions, nmentions):
                for m in mentions:
                    m.kbid = nkbid  # adopt the kept cluster's kbid
                    nmentions.append(m)
                merged = True
                break

        if not merged:
            # No compatible existing cluster; keep this kbid as a new cluster.
            new_kbids.append(kbid)
            new_mentions_kbids.append(mentions)

    Mention.save_as_edl_file(all_mentions, dst_file)
Exemplo n.º 2
0
def prev_mentions_format_to_new(tab_file, xml_file, output_file):
    """Convert old-format mentions (xml queries + tab links) to an EDL file.

    Positions are synthesized: each document's mentions are laid out back to
    back, so beg/end offsets are running utf-8 byte counts, not true
    document offsets (see TODO below).
    """
    xml_text = __read_text_file(xml_file)
    miter = re.finditer(xml_mention_pattern_str, xml_text)
    mentions_dict = dict()
    beg_pos_dict = dict()  # docid -> next free byte offset in that doc
    for m in miter:
        cur_doc_id = m.group(3)
        mention = Mention(name=m.group(2), docid=cur_doc_id, mention_id=m.group(1))
        doc_beg = beg_pos_dict.get(cur_doc_id, 0)  # TODO
        mention.beg_pos = doc_beg
        # end_pos is inclusive and measured in utf-8 bytes.
        mention.end_pos = doc_beg + len(mention.name.encode('utf-8')) - 1
        beg_pos_dict[cur_doc_id] = mention.end_pos + 1
        mentions_dict[mention.mention_id] = mention

    # Attach kbid and entity type from the tab file, matched by mention id.
    f = open(tab_file, 'r')
    for line in f:
        vals = line.strip().split('\t')
        if len(vals) < 3:
            continue
        m = mentions_dict.get(vals[0], None)
        if m:
            m.kbid = vals[1]
            m.entity_type = vals[2]
    f.close()

    Mention.save_as_edl_file(mentions_dict.values(), output_file)
Exemplo n.º 3
0
def clean_ner_result(result_file):
    """Parse a tab-separated NER result file and return merged mentions.

    Lines have at least 4 fields: beg, end, (text), type; an optional 5th
    field carries a MESH:/CHEBI: identifier.  Mentions carrying such an id
    are merged into the result first, plain mentions second.
    """
    plain_mentions = []
    linked_mentions = []

    with open(result_file, 'rb') as fin:
        for raw in fin:
            raw = raw.strip()
            if not raw:
                continue

            vals = raw.split('\t')
            # TODO: Disease/Chemical spans use an exclusive end offset, hence -1.
            end_adjust = 1 if vals[3] in ('Disease', 'Chemical') else 0
            mention = Mention()
            mention.span = (int(vals[0]), int(vals[1]) - end_adjust)
            mention.mtype = vals[3]

            if len(vals) == 4:
                plain_mentions.append(mention)
            else:
                kb_ref = vals[4]
                if kb_ref.startswith('MESH'):
                    mention.mesh_id = kb_ref[5:]
                elif kb_ref.startswith('CHEBI'):
                    mention.chebi_id = int(kb_ref[6:])
                linked_mentions.append(mention)

    merged = []
    Mention.merge_mention_list(linked_mentions, merged)
    Mention.merge_mention_list(plain_mentions, merged)
    return merged
Exemplo n.º 4
0
def main():
    """Post-process one EDL run: apply in-place fix-up passes, then save.

    Paths are hard-coded for a local Windows data layout ('e:/data/edl').
    """
    # dataset = 'LDC2015E75'
    dataset = 'LDC2015E103'
    # dataset = 'LDC2016E63'
    mentions_tag = '0'
    run_id = 4

    # datadir = '/home/dhl/data/EDL/'
    datadir = 'e:/data/edl'

    doc_list_file = os.path.join(datadir, dataset, 'data/eng-docs-list-win.txt')
    mid_type_file = os.path.join(datadir, 'res/freebase/mid-entity-type.txt')
    cur_edl_file = os.path.join(datadir, dataset, 'output/sys-link-sm-%s.tab' % mentions_tag)
    miss_match_mentions_file = os.path.join(datadir, dataset, 'output/miss-match-mentions-%s.txt' % mentions_tag)
    new_edl_file = os.path.join(datadir, dataset, 'output/sys-link-sm-pp-ft-%d.tab' % run_id)
    # __nil_clustering(nom_dict_file, edl_file, dst_file)
    mentions = Mention.load_edl_file(cur_edl_file)

    # __link_nom(doc_mentions_dict, max_nil_id)

    # Fix-up passes mutate the mention list in place; order matters.
    __nil_author_clustering(mentions)
    __fix_special_types(mentions)
    __fix_entity_types_by_mid(mid_type_file, mentions)
    # __fix_type_diff_of_same_kbid(mentions)
    __validate_mentions(doc_list_file, mentions, miss_match_mentions_file)
    __fix_pos_error(mentions)
    Mention.save_as_edl_file(mentions, new_edl_file, runid='WednesdayGo%d' % run_id)
Exemplo n.º 5
0
def __expand_location_names(mentions, tokenized_text_file,
                            entity_candidates_dict_file):
    """Expand location mention names using tokenized document text.

    tokenized_text_file format: a header line "docid<TAB>num_lines"
    followed by that many tokenized (utf-8, space-separated) lines.
    Confirmed expansions rename the matching mentions in place.
    """
    doc_mentions_dict = Mention.group_mentions_by_docid(mentions)

    expansion_candidates = []
    f = open(tokenized_text_file, 'r')
    for line in f:
        vals = line.strip().split('\t')
        docid = vals[0]
        num_lines = int(vals[1])
        doc_mentions = doc_mentions_dict[docid]
        # Consume the document's tokenized lines directly from the file iterator.
        for i in xrange(num_lines):
            line = f.next().decode('utf-8')
            words = line.strip().split(' ')
            expansion_candidates += __find_expansion_candidates_in_location_mentions(
                doc_mentions, words)
    f.close()

    # Keep only candidates confirmed against the entity-candidates dictionary.
    expansion_dict = __filter_expansion_candidates(
        expansion_candidates, entity_candidates_dict_file)
    qid_mentions = Mention.group_mentions_by_qid(mentions)
    for qid, mention in qid_mentions.iteritems():
        exp_name = expansion_dict.get(qid, '')
        if not exp_name:
            continue
        print '%s\t%s\t%s' % (qid, mention.name, exp_name)
        mention.name = exp_name
Exemplo n.º 6
0
def __apply_coref(edl_file, linking_info_file, dst_edl_file):
    """Assign kbids to NIL mentions from coreference links in a binary file.

    linking_info_file layout per document: docid (length-prefixed string),
    mention count (big-endian int32), nested flags (int8 per mention),
    coref indices (big-endian int32 per mention), then one mention record
    per mention.  A coref index > 0 points at the antecedent's position
    within the same document's qid list.
    """
    coref_dict = dict()  # mention qid -> qid of its coref antecedent
    f = open(linking_info_file, 'rb')
    while True:
        docid = ioutils.read_str_with_byte_len(f)
        if not docid:
            break
        num_mentions = np.fromfile(f, '>i4', 1)
        is_nested = np.fromfile(f, 'b', num_mentions)
        corefs = np.fromfile(f, '>i4', num_mentions)
        qids = list()
        for i in xrange(num_mentions):
            qid = __read_mention_from_linking_info_file(f)
            qids.append(qid)
        for coref_id, qid in izip(corefs, qids):
            if coref_id > 0:
                coref_dict[qid] = qids[coref_id]
    f.close()

    mentions = Mention.load_edl_file(edl_file)
    qid_mentions = Mention.group_mentions_by_qid(mentions)
    __assgin_different_id_to_all_nils(mentions)
    # NOTE(review): hard-coded debug print of one query id; remove for production.
    print qid_mentions['EDL14_ENG_0052'].kbid
    for m in mentions:
        if not m.kbid.startswith('NIL'):
            continue
        coref_qid = coref_dict.get(m.mention_id, '')
        if coref_qid:
            print m.mention_id, coref_qid, m.name, qid_mentions[coref_qid].kbid
            m.kbid = qid_mentions[coref_qid].kbid

    Mention.save_as_edl_file(mentions, dst_edl_file)
Exemplo n.º 7
0
    def link_text(self, text, mention_detection_result):
        """Link detected and MeSH-dictionary mentions in text; return JSON.

        The resulting JSON object has 'entities' (per-entity info keyed by
        assigned index) plus 'spans', 'idx' and 'type' lists aligned with
        the merged mention list.
        """
        result_dict = dict()

        # Merge detector output with dictionary-matched MeSH mentions.
        mesh_mention_list = self.__find_mesh_mentions(text)
        merged_mention_list = list()
        Mention.merge_mention_list(mention_detection_result,
                                   merged_mention_list)
        Mention.merge_mention_list(mesh_mention_list, merged_mention_list)

        self.__link_mention_to_wiki(text, merged_mention_list)

        mesh_idx_dict, wiki_idx_dict, chebi_idx_dict, idx_list = MedLink.__asign_indices(
            merged_mention_list)

        result_dict['entities'] = entities_dict = dict()
        self.__add_wiki_mention_info(wiki_idx_dict, entities_dict)
        self.__add_mesh_mention_info(mesh_idx_dict, entities_dict)
        self.__add_chebi_mention_info(chebi_idx_dict, entities_dict)

        # Spans and types are emitted in merged-mention order.
        result_span_list = list()
        mention_type_list = list()
        for mention in merged_mention_list:
            result_span_list.append(mention.span)
            mention_type_list.append(mention.mtype)
        result_dict['spans'] = result_span_list
        result_dict['idx'] = idx_list
        result_dict['type'] = mention_type_list

        return json.dumps(result_dict, indent=2)
Exemplo n.º 8
0
def __list_errors():
    """Print linking errors: gold vs. system kbid mismatches per query id.

    NIL-vs-NIL disagreements and gold NILs are skipped; for each real error
    the gold/system kbids and their resolved Wikipedia ids are printed.
    """
    gold_edl_file = 'e:/data/el/LDC2015E20/data/eval/data/mentions-raw.tab'
    sys_edl_file = 'e:/data/el/LDC2015E20/data/eval/output/emadr-result-coref.tab'
    eid_wid_file = 'e:/data/el/res/eid_wid_ord_eid.txt'

    eid_wid_dict = load_eid_wid_file(eid_wid_file)
    gold_mentions = Mention.load_edl_file(gold_edl_file)
    gold_qid_mentions = Mention.group_mentions_by_qid(gold_mentions)
    sys_mentions = Mention.load_edl_file(sys_edl_file)
    sys_qid_mentions = Mention.group_mentions_by_qid(sys_mentions)

    for qid, mention in gold_qid_mentions.iteritems():
        sys_mention = sys_qid_mentions[qid]
        if sys_mention.kbid == mention.kbid:
            continue
        # Both NIL: counted as agreement even though the NIL ids differ.
        if sys_mention.kbid.startswith('NIL') and mention.kbid.startswith(
                'NIL'):
            continue
        if mention.kbid.startswith('NIL'):
            continue
        wid_gold = eid_wid_dict.get(mention.kbid, -1)
        wid_sys = eid_wid_dict.get(sys_mention.kbid, -1)
        print '%s\t%s\t%s\t%s\t%d\t%d\t%s' % (
            qid, mention.kbid, sys_mention.kbid, mention.docid,
            mention.beg_pos, mention.end_pos, mention.name)
        print wid_gold, wid_sys
Exemplo n.º 9
0
def __save_link_result(edl_file, result_triv, qids, kbids_list, y_pred,
                       max_scores, dst_file, use_nil_thres):
    """Write linking predictions back onto the EDL mentions and save them.

    result_triv holds trivially resolved qid->kbid links and takes priority;
    remaining mentions take the predicted candidate (or 'NIL' when
    use_nil_thres is set and the top score is below 0.5).  Finally all
    Freebase mids and NIL ids are collapsed to 'NIL0001'.
    """
    mentions = Mention.load_edl_file(edl_file)
    for m in mentions:
        m.kbid = 'NODEF'  # sentinel: not yet linked

    qid_mentions = Mention.group_mentions_by_qid(mentions)
    for qid, kbid in result_triv.iteritems():
        qid_mentions[qid].kbid = kbid

    for qid, kbids, y, max_score in izip(qids, kbids_list, y_pred, max_scores):
        # NOTE(review): when y >= len(kbids) this only warns; kbids[y] below
        # would still raise IndexError if the mention is unlinked — confirm
        # upstream guarantees y is in range.
        if y >= len(kbids):
            print y, len(kbids)
        if qid_mentions[qid].kbid == 'NODEF':
            if use_nil_thres and max_score < 0.5:
                qid_mentions[qid].kbid = 'NIL'
            else:
                qid_mentions[qid].kbid = kbids[y]

    for m in mentions:
        if m.kbid.startswith('m.') or m.kbid.startswith('NIL'):
            m.kbid = 'NIL0001'

    Mention.save_as_edl_file(mentions, dst_file)
Exemplo n.º 10
0
def __remove_leading_the(metions_file, dst_mentions_edl_file):
    """Strip a leading 'the ' from mention names, shifting beg_pos to match,
    and save the adjusted mentions as a new EDL file."""
    prefix = 'the '
    mention_list = Mention.load_edl_file(metions_file)
    for mention in mention_list:
        if mention.name.startswith(prefix):
            mention.name = mention.name[len(prefix):]
            mention.beg_pos += len(prefix)

    Mention.save_as_edl_file(mention_list, dst_mentions_edl_file)
Exemplo n.º 11
0
def __evaluate_edl(gold_edl_file, sys_edl_file, require_type_match, link_error_file,
                   type_error_file):
    """Score a system EDL file against gold; print precision/recall/F1.

    A hit requires exact span match, matching kbid, and (optionally)
    matching entity type.  System mentions matching a gold NOM mention are
    excluded from the system count.  Link and type errors are written to
    the given files, sorted by gold mention name.

    NOTE(review): raises ZeroDivisionError if sys_cnt or gold_cnt is 0.
    """
    gold_mentions = Mention.load_edl_file(gold_edl_file, arrange_by_docid=True)
    sys_mentions = Mention.load_edl_file(sys_edl_file, arrange_by_docid=True)

    link_errors = list()
    type_errors = list()
    sys_cnt, gold_cnt, hit_cnt = 0, 0, 0
    for docid, sys_mentions_doc in sys_mentions.iteritems():
        gold_mentions_doc = gold_mentions.get(docid, list())
        # Only non-NIL gold mentions count toward recall.
        for gm in gold_mentions_doc:
            if not gm.kbid.startswith('NIL'):
                gold_cnt += 1

        hit_list = [False for _ in xrange(len(gold_mentions_doc))]
        for sm in sys_mentions_doc:
            for i, gm in enumerate(gold_mentions_doc):
                if sm.beg_pos == gm.beg_pos and sm.end_pos == gm.end_pos:
                    hit_list[i] = True
                    break

            if sm.kbid.startswith('NIL'):
                continue
            sys_cnt += 1
            for i, gm in enumerate(gold_mentions_doc):
                if sm.beg_pos != gm.beg_pos or sm.end_pos != gm.end_pos:
                    continue

                # Nominal gold mentions are excluded from the system count.
                if gm.mention_type == 'NOM':
                    sys_cnt -= 1
                    break

                if sm.entity_type != gm.entity_type:
                    type_errors.append((gm, sm))

                if sm.kbid == gm.kbid and ((not require_type_match) or sm.entity_type == gm.entity_type):
                    hit_cnt += 1

                if sm.kbid != gm.kbid:
                    link_errors.append((gm, sm))

    link_errors.sort(key=lambda x: x[0].name)
    __write_link_errors(link_errors, link_error_file)
    type_errors.sort(key=lambda x: x[0].name)
    __write_type_errors(type_errors, type_error_file)

    print '#hit: %d, #sys: %d, #gold: %d' % (hit_cnt, sys_cnt, gold_cnt)
    hit_cnt = float(hit_cnt)
    prec = hit_cnt / sys_cnt
    recall = hit_cnt / gold_cnt
    f1 = 2 * prec * recall / (prec + recall)
    print 'prec: %f, recall: %f, f1: %f' % (prec, recall, f1)
Exemplo n.º 12
0
 def __find_mesh_mentions(self, text):
     """Dictionary-match MeSH terms in *text*; return them as MISC mentions.

     Each match becomes a Mention with its span and mesh_id populated.
     """
     mesh_spans, mesh_ids = self.mesh_match.find_all_terms(text)
     mention_list = list()
     for mesh_span, mesh_id in izip(mesh_spans, mesh_ids):
         mention = Mention()
         mention.span = mesh_span
         mention.mtype = 'MISC'
         mention.mesh_id = mesh_id
         mention_list.append(mention)
     return mention_list
Exemplo n.º 13
0
def __find_type_errors(gold_edl_file, sys_edl_file):
    """Print entity-type disagreements between gold and system EDL files.

    Errors are gathered per document via __find_type_errors_of_docs and
    printed sorted by (lower-cased) gold mention name.
    """
    gold_mentions_docs = Mention.load_edl_file(gold_edl_file, arrange_by_docid=True)
    sys_mentions_docs = Mention.load_edl_file(sys_edl_file, arrange_by_docid=True)

    all_errors = list()
    for docid, sys_mentions in sys_mentions_docs.iteritems():
        gold_mentions = gold_mentions_docs[docid]
        all_errors += __find_type_errors_of_docs(docid, gold_mentions, sys_mentions)
    all_errors.sort(key=lambda x: x[0].name.lower())
    for v in all_errors:
        print '%s\t%s\t%s\t%s' % (v[0].name, v[0].entity_type, v[1].entity_type, v[0].docid)
Exemplo n.º 14
0
def __extract_nom_mentions(nom_dict_file, doc_list_file, words_pos_file, dst_nom_mentions_file):
    """Find nominal (NOM) PER mentions by phrase-matching a nominal dict.

    words_pos_file interleaves per-doc headers ("docid<TAB>num_sentences")
    with tokenized/POS-tagged sentences; matched spans are mapped back to
    character offsets in the original document text and saved as an EDL
    file with kbid 'NIL00000'.
    """
    noms = load_nom_dict(nom_dict_file)
    # Longest names first so longer phrases win during matching.
    nom_name_list = [n for n in noms]
    nom_name_list.sort(key=lambda x: -len(x))
    nom_name_list = [n.split(' ') for n in nom_name_list]

    doc_path_dict = __load_doc_paths_as_dict(doc_list_file)

    mentions = list()
    f_wp = open(words_pos_file, 'r')
    for i, line in enumerate(f_wp):
        vals = line.rstrip().split('\t')
        docid = vals[0]

        if (i + 1) % 10 == 0:
            print i + 1, docid

        doc_path = doc_path_dict[docid]
        doc_text = read_text(doc_path).decode('utf-8')
        # Offsets in the sentence tuples are relative to text after the XML header.
        if doc_text.startswith(doc_head):
            doc_text = doc_text[len(doc_head):]

        num_sentences = int(vals[1])
        for j in xrange(num_sentences):
            sentence = __next_sentence_in_words_pos_file(f_wp)
            words = [tup[0].lower() for tup in sentence]
            hit_spans, hit_indices = find_phrases_in_words(nom_name_list, words, False)
            for hit_span, hit_idx in izip(hit_spans, hit_indices):
                # tup layout: (word, ?, pos_tag, beg_char, end_char)
                beg_pos = sentence[hit_span[0]][3]
                end_pos = sentence[hit_span[1] - 1][4]

                tags = [tup[2] for tup in sentence[hit_span[0]:hit_span[1]]]
                # Require at least one common-noun tag in the span.
                if 'NN' not in tags:
                    continue

                name = doc_text[beg_pos:end_pos + 1].replace('\n', ' ')
                # Skip spans that fall inside markup or URLs.
                if '&lt;' in name or 'http:' in name or '&gt;' in name:
                    continue
                m = Mention(name=name, beg_pos=beg_pos, end_pos=end_pos, docid=docid, mention_type='NOM',
                            entity_type='PER', kbid='NIL00000')
                mentions.append(m)
    f_wp.close()

    Mention.save_as_edl_file(mentions, dst_nom_mentions_file)
Exemplo n.º 15
0
def __gen_ttl_dict():
    edl_file = '/home/dhl/data/EDL/LDC2015E103/data/gold-eng-mentions.tab'
    dst_file = '/home/dhl/data/EDL/LDC2015E75/data/ttl-dict.txt'
    mentions = Mention.load_edl_file(edl_file)
    for m in mentions:
        if m.entity_type == 'TTL':
            print m.name, m.entity_type, m.mention_type, m.docid
Exemplo n.º 16
0
    def _decode(mention_json):
        """
        Decode a json string of a sentence.
        e.g.,  {"senid":40,
                "mentions":[{"start":0,"end":2,"labels":["/person"]},
                            {"start":6,"end":8,"labels":["/location/city","/location"]}],
                "tokens":["Raymond","Jung",",","51",",","of","Federal","Way",";",
                         "accused","of","leasing","apartments","where","the","women",
                         "were","housed","."],
                "fileid":""}
        :param mention_json: string
        :return: a sentence instance with all mentions appearing in this sentence
        """
        if mention_json == '':
            return None
        decoded = json.loads(mention_json)
        sentence = Sentence(decoded['fileid'], decoded['senid'], decoded['tokens'])
        for m in decoded['mentions']:
            # Fix: this statement was tab-indented at the wrong level in the
            # original, which is an indentation/tab error in Python.
            sentence.add_mention(
                Mention(int(m['start']), int(m['end']), m['labels'],
                        " ".join(decoded['tokens'][m['start']:m['end']])))
        if 'pos' in decoded:
            sentence.pos = decoded['pos']
        if 'dep' in decoded:
            for dep in decoded['dep']:
                sentence.dep.append((dep['type'], dep['gov'], dep['dep']))
        return sentence
Exemplo n.º 17
0
    def from_coref(cls, coref):
        """Build an instance from a document.Coreference.

        Each mention of *coref* is converted via Mention.from_mention;
        the argument type is checked first.
        """
        check_type(coref, document.Coreference)
        converted = list(map(Mention.from_mention, coref.mentions))
        return cls(converted)
Exemplo n.º 18
0
def __type_eval():
    """Evaluate mid->entity-type predictions against gold EDL mentions.

    Only mentions linked to a Freebase mid ('m.' prefix) are scored;
    unknown mids default to type 'ORG'.  Prints hit count and accuracy.
    """
    tac_edl_file = 'e:/el/LDC2015E75/data/tac_kbp_2015_tedl_training_gold_fixed.tab'
    mid_type_file = 'e:/el/res/freebase/mid-entity-type.txt'

    mid_type_dict = dict()
    f = open(mid_type_file, 'r')
    for line in f:
        vals = line[:-1].split('\t')
        mid_type_dict[vals[0]] = vals[1]
    f.close()

    hitcnt, cnt = 0, 0
    mentions = Mention.load_edl_file(tac_edl_file)
    for m in mentions:
        if not m.mid.startswith('m.'):
            continue
        cnt += 1
        # The dict is keyed without the 'm.' prefix.
        sys_type = mid_type_dict.get(m.mid[2:], 'ORG')
        if sys_type == m.entity_type:
            hitcnt += 1
        else:
            print m.mid, m.entity_type, sys_type
    print hitcnt, cnt
    print float(hitcnt) / cnt
Exemplo n.º 19
0
def __validate_mentions(doc_list_file, mentions, dst_miss_match_file):
    """Check that each mention's name matches its span in the source doc.

    Mismatches are written to dst_miss_match_file as
    docid<TAB>name<TAB>beg<TAB>end<TAB>text_found; a count is printed.
    """
    print 'checking miss match'
    doc_mentions = Mention.arrange_mentions_by_docid(mentions)
    doc_paths = load_doc_paths(doc_list_file)
    # Mention offsets are relative to text after this XML declaration.
    doc_head = '<?xml version="1.0" encoding="utf-8"?>\n'
    miss_match_cnt = 0
    fout = open(dst_miss_match_file, 'wb')
    for doc_path in doc_paths:
        docid = doc_id_from_path(doc_path)
        cur_doc_mentions = doc_mentions.get(docid, list())
        if not cur_doc_mentions:
            continue

        doc_text = read_text(doc_path, True)
        if doc_text.startswith(doc_head):
            doc_text = doc_text[len(doc_head):]

        for m in cur_doc_mentions:
            # end_pos is inclusive.
            name_in_doc = doc_text[m.beg_pos:m.end_pos + 1]
            if name_in_doc != m.name:
                miss_match_cnt += 1
                fout.write('%s\t%s\t%d\t%d\t%s\n' % (docid, m.name.encode('utf-8'), m.beg_pos, m.end_pos,
                                                     name_in_doc.encode('utf-8')))
    fout.close()
    print miss_match_cnt, 'miss match'
Exemplo n.º 20
0
    def collect_mentions(self):
        """Build Mention objects from CoNLL-style coref bracket annotations.

        Token coref fields look like "(12", "12)", "(12)" joined by '|'.
        Opens are pushed per coref id (starts are scanned in reverse so the
        innermost open pairs with the nearest close); each close pops the
        matching open and emits a Mention covering the token span.
        """
        mention_ids = defaultdict(list)  # coref id -> stack of open starts

        def get_start_ids(cr):
            # Coref ids opened at this token, e.g. "(12|(3" -> [12, 3].
            return [
                int(x.replace(')', '').replace('(', '')) for x in cr.split('|')
                if x.startswith('(')
            ]

        def get_end_ids(cr):
            # Coref ids closed at this token.
            return [
                int(x.replace(')', '').replace('(', '')) for x in cr.split('|')
                if x.endswith(')')
            ]

        starts = [(i, t) for (i, t) in enumerate(self.tokens)
                  if t.coref.find('(') > -1]
        # Reverse so that pop() pairs each close with the nearest open.
        starts.reverse()
        ends = [(i, t) for (i, t) in enumerate(self.tokens)
                if t.coref.find(')') > -1]

        for s in starts:
            ids = get_start_ids(s[1].coref)
            for i in ids:
                mention_ids[i].append(s)

        for e in ends:
            ids = get_end_ids(e[1].coref)
            for i in ids:
                s = mention_ids[i].pop()
                self.mentions.append(
                    Mention(self.tokens[s[0]:e[0] + 1], self.sentenceID,
                            (s[0], e[0]), i))
Exemplo n.º 21
0
def __check_mention_fb_types():
    """Dump the Freebase types of every linked (non-NIL) gold mention.

    Streams the gzipped mid->fb-type file and writes, for every mid that
    matches a mention, a line "name<TAB>mid<TAB>entity_type<TAB>fb_type".
    """
    tac_edl_file = 'e:/el/LDC2015E75/data/tac_kbp_2015_tedl_training_gold_fixed.tab'
    fb_type_file = 'e:/el/res/freebase/mid-fb-type.gz'
    result_file = 'e:/el/LDC2015E75/data/mention-fb-types.txt'

    mentions = Mention.load_edl_file(tac_edl_file)
    mid_mentions = dict()
    for m in mentions:
        if m.mid.startswith('NIL'):
            continue
        # Strip the 'm.' prefix to match the fb-type file's key format.
        mid_mentions[m.mid[2:]] = m

    f = gzip.open(fb_type_file, 'r')
    fout = open(result_file, 'wb')
    for i, line in enumerate(f):
        vals = line[:-1].split('\t')
        m = mid_mentions.get(vals[0], None)
        if m:
            fout.write('%s\t%s\t%s\t%s\n' % (m.name.encode('utf-8'), vals[0], m.entity_type, vals[1]))

        # Progress report while streaming the large type file.
        if (i + 1) % 1000000 == 0:
            print i + 1
    f.close()
    fout.close()
Exemplo n.º 22
0
def __gold_mention_insight():
    """Inspect gold mentions: nested PER mentions and multi-word PER names.

    For ENG documents, prints every pair where a PER mention's span fully
    contains another mention, and counts PER names containing a space.
    """
    edl_gold_file = 'e:/el/LDC2015E103/data/tac_kbp_2015_tedl_evaluation_gold_standard_entity_mentions.tab'
    mentions = Mention.load_edl_file(edl_gold_file)
    doc_mention_dict = dict()
    for m in mentions:
        if m.docid.startswith('ENG'):
            # Insert the list on first sight; later appends mutate it in place.
            mlist = doc_mention_dict.get(m.docid, list())
            if not mlist:
                doc_mention_dict[m.docid] = mlist
            mlist.append(m)

    cnt, fncnt = 0, 0
    for docid, doc_mentions in doc_mention_dict.iteritems():
        print docid
        for m0 in doc_mentions:
            if m0.entity_type == 'PER' and ' ' in m0.name:
                fncnt += 1
            for m1 in doc_mentions:
                if m0 == m1:
                    continue
                # m0 (PER) fully contains m1: a nested-mention case.
                if m0.beg_pos <= m1.beg_pos and m0.end_pos >= m1.end_pos and m0.entity_type == 'PER':
                    print '\t%s\t%d\t%d' % (m0.name, m0.beg_pos, m0.end_pos)
                    print '\t%s\t%d\t%d' % (m1.name, m1.beg_pos, m1.end_pos)
                    cnt += 1

    print cnt, fncnt
Exemplo n.º 23
0
def __evaluate_ed(gold_edl_file, sys_edl_file, fn_file, fp_file, require_type_match=True):
    """Score entity detection (spans + optional types) and print P/R/F1.

    False positives (unmatched system mentions) are written to fp_file;
    false negatives (unmatched gold mentions) to fn_file, sorted by name.

    NOTE(review): raises ZeroDivisionError if sys_cnt or gold_cnt is 0.
    """
    gold_mentions = Mention.load_edl_file(gold_edl_file, arrange_by_docid=True)
    sys_mentions = Mention.load_edl_file(sys_edl_file, arrange_by_docid=True)

    fout_fp = open(fp_file, 'wb')
    sys_cnt, gold_cnt, hit_cnt = 0, 0, 0
    fn_mentions = list()
    for docid, sys_mentions_doc in sys_mentions.iteritems():
        sys_cnt += len(sys_mentions_doc)

        all_gold_mentions_in_doc = gold_mentions.get(docid, list())
        # nam_gold_mentions = [m for m in all_gold_mentions_in_doc if m.mention_type == 'NAM']
        nam_gold_mentions = all_gold_mentions_in_doc
        gold_hit_tags = [False] * len(nam_gold_mentions)
        gold_cnt += len(nam_gold_mentions)

        for sm in sys_mentions_doc:
            hit = False
            for i, gm in enumerate(nam_gold_mentions):
                # Type match uses prefix comparison (e.g. 'GPE.xxx' vs 'GPE').
                type_hit = (sm.entity_type.startswith(gm.entity_type)) if require_type_match else True
                if sm.beg_pos == gm.beg_pos and sm.end_pos == gm.end_pos and type_hit:
                    hit = True
                    hit_cnt += 1
                    gold_hit_tags[i] = True
                    break

            if not hit:
                fout_fp.write('%s\t%s\t%d\t%d\n' % (sm.name.encode('utf-8'), docid, sm.beg_pos, sm.end_pos))

        for gm, hit in izip(nam_gold_mentions, gold_hit_tags):
            if not hit:
                fn_mentions.append(gm)

    fout_fp.close()

    fn_mentions.sort(key=lambda x: x.name)
    Mention.write_mentions(fn_mentions, fn_file)

    print '#hit: %d, #sys: %d, #gold: %d' % (hit_cnt, sys_cnt, gold_cnt)
    hit_cnt = float(hit_cnt)
    prec = hit_cnt / sys_cnt
    recall = hit_cnt / gold_cnt
    f1 = 2 * prec * recall / (prec + recall)
    print 'prec: %f, recall: %f, f1: %f' % (prec, recall, f1)
Exemplo n.º 24
0
def link():
    """Smoke test: link one hard-coded PER mention in a sample sentence."""
    print 'beg init'
    med_link = init_model()
    curtext = '“That\'s a growth rate of 6,000 times over three years,” touts Turner.'
    m = Mention(span=(0, 4), mtype='PER')
    mentions = [m]
    lr = med_link.link_mentions(mentions, curtext)
    print __mentions_to_dict_list(lr)
Exemplo n.º 25
0
 def end(self, tag):
     """SAX-style close-tag handler for CoreNLP XML output.

     Flushes the element accumulated in instance state (sentence, token,
     dependency, coreference, mention) into the appropriate collection and
     resets the per-element scratch fields.
     """
     self.tag = ''
     if tag == 'sentences':
         if self.parse_sent:
             self.parse_sent = False
     elif tag == 'sentence':
         if self.parse_sent:
             if self.sent is not None:
                 self.sents.append(deepcopy(self.sent))
                 self.sent = None
     elif tag == 'token':
         # map corenlp ner tags to coerse grained ner tags
         token = Token(self.word,
                       self.lemma,
                       self.pos,
                       ner=convert_corenlp_ner_tag(self.ner))
         self.sent.add_token(deepcopy(token))
         self.word = ''
         self.lemma = ''
         self.pos = ''
         self.ner = ''
     elif tag == 'dependencies':
         if self.parse_dep:
             self.parse_dep = False
     elif tag == 'dep':
         if self.parse_dep:
             # Copied ("extra") dependencies are skipped; root arcs dropped.
             if not self.copied_dep:
                 if self.dep_label != 'root':
                     dep = Dependency(self.dep_label, self.gov_idx,
                                      self.dep_idx, self.extra)
                     self.sent.add_dep(deepcopy(dep))
             else:
                 self.copied_dep = False
             self.dep_label = ''
             self.gov_idx = -1
             self.dep_idx = -1
             self.extra = False
     elif tag == 'coreference':
         # Closing an inner <coreference> flushes the chain; closing the
         # outer wrapper (self.coref is None) ends coref parsing.
         if self.parse_coref:
             if self.coref is not None:
                 self.corefs.append(deepcopy(self.coref))
                 self.coref = None
             else:
                 self.parse_coref = False
     elif tag == 'mention':
         mention = Mention(self.sent_idx,
                           self.start_token_idx,
                           self.end_token_idx,
                           head_token_idx=self.head_token_idx,
                           rep=self.rep,
                           text=self.text.encode('ascii', 'ignore'))
         self.coref.add_mention(deepcopy(mention))
         self.sent_idx = -1
         self.start_token_idx = -1
         self.end_token_idx = -1
         self.head_token_idx = -1
         self.rep = False
         self.text = ''
Exemplo n.º 26
0
def get_cand_mentions(corpus, limit=5, check=False):
    """
    :param corpus: 1D: n_doc, 2D: n_sents, 3D: n_words; elem=(doc_id, part_id, word, tag, syn, ne, coref_id)
    :param limit: maximum allowed mention span length (in words)
    :param check: if True, dump candidate spans to 'cand_mentions.txt'
    :return: cand: 1D: n_doc, 2D: n_sents, 3D: n_mentions; elem=Mention

    NOTE(review): the summary line divides by the candidate count and so
    raises ZeroDivisionError when no candidates are found.
    """
    cand_ments = []
    count = 0.
    max_span_len = -1
    total_span_len = 0.

    for doc_i, doc in enumerate(corpus):
        doc_ments = []

        for sent_i, sent in enumerate(doc):
            mention_spans = []
            """ Extracting NP, Pro-Nom, NE mentions """
            mention_spans.extend(get_np(sent))
            mention_spans.extend(get_pronominals(sent))
            mention_spans.extend(get_ne(sent))
            """ Removing duplicates, and sorting """
            mention_spans = list(set(mention_spans))
            mention_spans.sort()

            tmp_ments = []
            for span in mention_spans:
                # Spans are inclusive word-index pairs.
                span_len = span[1] - span[0] + 1

                if span_len <= limit:
                    tmp_ments.append(Mention(doc_i, sent_i, span))

                    if span_len > max_span_len:
                        max_span_len = span_len
                    total_span_len += span_len

            doc_ments.append(tmp_ments)
            count += len(tmp_ments)

        cand_ments.append(doc_ments)

    print 'Cand Mentions: %d  Max Span Length: %d  Avg. Span Length: %f' % (
        count, max_span_len, total_span_len / count)

    if check:
        with open('cand_mentions.txt', 'w') as f:
            for doc, doc_ments in zip(corpus, cand_ments):
                for sent, sent_ments in zip(doc, doc_ments):
                    for ment in sent_ments:
                        print >> f, '%s' % str(ment.span)
                    print >> f

                    for sent_i, w in enumerate(sent):
                        print >> f, '%d\t%s\t%s' % (sent_i,
                                                    w[2].encode('utf-8'),
                                                    w[-1].encode('utf-8'))
                    print >> f

    return cand_ments
Exemplo n.º 27
0
def __compare_mentions():
    """Print mentions present in run 1 but missing (by span) from run 0."""
    datadir = 'e:/data/edl'
    edl_file0 = '%s/LDC2016E63/output/ner-mentions-0.tab' % datadir
    edl_file1 = '%s/LDC2016E63/output/ner-mentions-1.tab' % datadir
    mentions0 = Mention.load_edl_file(edl_file0, True)
    mentions1 = Mention.load_edl_file(edl_file1, True)

    for docid, doc_mentions1 in mentions1.iteritems():
        print docid
        doc_mentions0 = mentions0.get(docid, list())
        for m1 in doc_mentions1:
            # A mention matches only on exact beg/end positions.
            found = False
            for m0 in doc_mentions0:
                if m0.beg_pos == m1.beg_pos and m0.end_pos == m1.end_pos:
                    found = True
                    break
            if not found:
                print '\t%s\t%d\t%d' % (m1.name, m1.beg_pos, m1.end_pos)
Exemplo n.º 28
0
def __build_training_data(qid_x_list, edl_file):
    """Build (features, label) training lists from candidate tuples.

    Each tuple is (qid, kbid, first_candidate, commonness, dist).  Tuples
    whose kbid disagrees with a non-NIL gold link are discarded; the label
    is 0 for NIL-linked mentions, 1 otherwise.  Only the first_candidate
    and commonness features are used.
    """
    gold_by_qid = Mention.group_mentions_by_qid(Mention.load_edl_file(edl_file))
    train_x, train_y = [], []
    for qid, kbid, first_candidate, commonness, dist in qid_x_list:
        mention = gold_by_qid[qid]
        is_nil = mention.kbid.startswith('NIL')

        if not is_nil and mention.kbid != kbid:
            continue

        train_x.append([first_candidate, commonness])
        train_y.append(0 if is_nil else 1)
    return train_x, train_y
Exemplo n.º 29
0
 def read_mentions(self):
     """Load mentions from self.mentions_path, if it exists.

     Each line is parsed via Mention.from_string and appended to either
     self.mentions or self.private_mentions, keyed by the mention target.
     """
     if not os.path.isfile(self.mentions_path):
         return
     with open(self.mentions_path) as fin:
         for raw_line in fin:
             mention = Mention.from_string(raw_line)
             bucket = self.private_mentions if mention.private else self.mentions
             bucket[mention.target].append(mention)
Exemplo n.º 30
0
def __gen_training_data(edl_file):
    """Build (features, label) data from NIL-mention clustering samples.

    Positive samples come from same-kbid mention pairs; negatives are
    sampled to match the positive count.  Each sample's feature vector is
    produced by __get_features.
    """
    mentions = Mention.load_edl_file(edl_file)
    nil_mentions = __get_nil_mentions(mentions)
    kbid_mentions = Mention.group_mentions_by_kbid(nil_mentions)
    pos_samples = __gen_positive_samples(kbid_mentions)
    neg_samples = __gen_neg_samples(kbid_mentions, len(pos_samples))

    data_x = list()
    data_y = list()
    all_samples = __merge_samples(pos_samples, neg_samples)
    for sample, y, in all_samples:
        sample_x = __get_features(sample)
        data_x.append(sample_x)
        data_y.append(y)

    # Debug dump of the generated data.
    for x, y in izip(data_x, data_y):
        print x, y

    return data_x, data_y
Exemplo n.º 31
0
def __get_mid_types_in_dataset():
    datadir = 'e:/data/edl'
    edl_file = os.path.join(datadir, 'LDC2015E75/data/gold-eng-mentions.tab')
    mid_types_file = os.path.join(datadir, 'res/freebase/mid-fb-type.gz')
    dst_file = os.path.join(datadir, 'LDC2015E75/output/fb-types.txt')

    mentions = Mention.load_edl_file(edl_file)
    for m in mentions:
        if m.kbid.startswith('m.'):
            m.kbid = m.kbid[2:]
    kbid_mentions = Mention.group_mentions_by_kbid(mentions)

    f = gzip.open(mid_types_file, 'r')
    fout = open(dst_file, 'wb')
    hit = False
    prev_kbid = ''
    for i, line in enumerate(f):
        tab_pos = line.find('\t')
        kbid = line[:tab_pos]

        if hit and prev_kbid == kbid:
            fout.write('\t%s' % line)
        elif prev_kbid != kbid:
            if kbid in kbid_mentions:
                cur_mentions = kbid_mentions[kbid]
                for m in cur_mentions:
                    fout.write('%s\t' % m.name.encode('utf-8'))
                fout.write('\n')
                for m in cur_mentions:
                    fout.write('%s\t' % m.entity_type)
                fout.write('\n\t%s' % line)
                hit = True
            else:
                hit = False

        prev_kbid = kbid

        if (i + 1) % 10000000 == 0:
            print i + 1
    f.close()
    fout.close()
Exemplo n.º 32
0
    def link_mentions_info(self, text, mention_detection_result, find_mesh_mentions_by_dict=False):
        """Link mentions found in *text* and return a JSON string with
        entity info ('entities'), mention spans ('spans'), entity indices
        ('idx') and mention types ('type').

        NOTE(review): mention_detection_result is merged into the mention
        list only when find_mesh_mentions_by_dict is True; otherwise
        link_mentions receives an empty list and the detection result is
        ignored — confirm this is intended.
        """
        merged_mention_list = list()
        if find_mesh_mentions_by_dict:
            # Combine the detector's mentions with dictionary-based MeSH hits.
            mesh_mention_list = self.__find_mesh_mentions(text)
            Mention.merge_mention_list(mention_detection_result, merged_mention_list)
            Mention.merge_mention_list(mesh_mention_list, merged_mention_list)

        linked_mentions = self.link_mentions(merged_mention_list, text)

        if find_mesh_mentions_by_dict:
            # Propagate mesh_id/chebi_id from any resolved mention to all
            # other mentions with the same (case-insensitive) surface name.
            for mention in merged_mention_list:
                if mention.mesh_id or mention.chebi_id > -1:
                    for mention1 in merged_mention_list:
                        if mention.name.lower() == mention1.name.lower():
                            mention1.mesh_id = mention.mesh_id
                            mention1.chebi_id = mention.chebi_id

        mesh_idx_dict, wiki_idx_dict, chebi_idx_dict, idx_list = MedLink.__asign_indices(linked_mentions)
        # print wiki_idx_dict

        # Entity details per knowledge source, keyed into one dict.
        result_dict = dict()
        result_dict['entities'] = entities_dict = dict()
        self.__add_wiki_mention_info(wiki_idx_dict, entities_dict)
        self.__add_mesh_mention_info(mesh_idx_dict, entities_dict)
        self.__add_chebi_mention_info(chebi_idx_dict, entities_dict)

        result_span_list = list()
        mention_type_list = list()
        for mention in linked_mentions:
            result_span_list.append(mention.span)
            mention_type_list.append(mention.mtype)

        # Adjust mention types for MeSH-linked entries.
        self.__fix_types(mesh_idx_dict, idx_list, mention_type_list)

        result_dict['spans'] = result_span_list
        result_dict['idx'] = idx_list
        result_dict['type'] = mention_type_list

        return json.dumps(result_dict, indent=2)
Exemplo n.º 33
0
def get_gold_ments(doc_i, sent_i, sent):
    """Extract gold coreference mentions from one sentence.

    :param sent: 1D: n_words; elem=(doc_id, part_id, word, tag, syn, ne, coref)
    :return: 1D: n_mentions; elem=Mention
    """
    ments = []
    open_spans = []  # ((start, start), coref_id) for not-yet-closed mentions

    for idx, word in enumerate(sent):
        for tag in word[6].split('|'):
            if tag.startswith('('):
                if tag.endswith(')'):
                    # Single-token mention, e.g. "(12)".
                    ments.append(Mention(doc_i, sent_i, (idx, idx),
                                         int(tag[1:-1])))
                else:
                    # A mention opens here, e.g. "(12".
                    open_spans.append(((idx, idx), int(tag[1:])))
            elif tag.endswith(')'):
                # A mention closes here, e.g. "12)"; match the earliest
                # open span with the same coref id.
                cid = int(tag[:-1])
                for k, opened in enumerate(open_spans):
                    if cid == opened[1]:
                        ments.append(Mention(doc_i, sent_i,
                                             (opened[0][0], idx), cid))
                        open_spans.pop(k)
                        break
                else:
                    print 'Error at extract_mentions(): %s' % str(sent)
                    exit()

    assert len(open_spans) == 0
    return ments
Exemplo n.º 34
0
def __missing_docs_in_edl_file():
    datadir = 'e:/data/edl'
    edl_file = '%s/LDC2016E63/output/all-mentions.tab' % datadir
    doc_list_file = '%s/LDC2016E63/data/eng-docs-list-win.txt' % datadir

    mentions = Mention.load_edl_file(edl_file)
    docids = set()
    for m in mentions:
        docids.add(m.docid)

    f = open(doc_list_file, 'r')
    for line in f:
        doc_path = line.rstrip()
        docid = doc_id_from_path(doc_path)
        if docid not in docids:
            print docid
    f.close()
Exemplo n.º 35
0
def get_review_for_user(username, user_rev_idx):
    """Fetch the review at 1-based index user_rev_idx for username.

    Returns (review_idx, review_source, mentions_sorted_by_begpos), or
    None when the dispatcher reports no review available.
    """
    # The review index is 1-based; clamp anything below.
    idx = max(user_rev_idx, 1)

    reply = __query_review_dispatcher(username, idx)

    rev_idx = reply['review_idx']
    rev_id = reply['review_id']
    if rev_id == 'NULL':
        return None

    doc = es.get(index=index_name, doc_type=rev_doc_type, id=rev_id)

    mentions = [Mention.from_dict(mdict) for mdict in reply['mentions']]
    mentions.sort(key=lambda m: m.begpos)

    return rev_idx, doc['_source'], mentions
Exemplo n.º 36
0
def edl_api():
    doc_text = ''
    if 'text' in request.values:
        doc_text = request.values['text']
        # print doc_text
        # print type(doc_text)
    else:
        abort(400)

    json_result = '[]'
    try:
        mentions_list = list()
        mentions_dict = mention_extraction_web(doc_text)
        for result_type, mentions in mentions_dict.items():
            entity_type = 'MISC'
            if result_type == 'results_Disease':
                entity_type = 'Disease'
            elif result_type == 'results_Chemical':
                entity_type = 'Chemical'

            for dict_mention in mentions:
                beg_pos = dict_mention['startChar']
                end_pos = dict_mention['endChar']
                meshid = None
                specified_type = dict_mention.get('label', None)
                if specified_type:
                    entity_type = specified_type
                # print dict_mention
                # print beg_pos, end_pos, entity_type, meshid
                m = Mention(span=(beg_pos, end_pos),
                            mtype=entity_type,
                            mesh_id=meshid)
                mentions_list.append(m)
        # linked_mentions = med_link.link_mentions(mentions_list, doc_text.decode('utf-8'))
        linked_mentions = med_link.link_mentions(mentions_list, doc_text)
        json_result = json.dumps(__mentions_to_dict_list(linked_mentions))
    except:
        print 'except'
    print json_result + '\n'
    return json_result
Exemplo n.º 37
0
def __merge_mentions(mention_file_list, dst_result_file):
    """Merge several EDL mention files into one output file.

    Mentions sharing the same (beg_pos, end_pos) span within the same
    document are emitted only once; every kept mention gets a fresh
    sequential 'EDL_XXXXXXX' mention id.
    """
    seen_spans_by_doc = dict()
    mention_id = 1
    # Context manager so the output file is closed even on error
    # (the original leaked the handle on exceptions).
    with open(dst_result_file, 'wb') as fout:
        for mention_file in mention_file_list:
            for m in Mention.load_edl_file(mention_file):
                # setdefault replaces the fragile get()/store-if-empty
                # dance of the original.
                doc_spans = seen_spans_by_doc.setdefault(m.docid, set())
                span = (m.beg_pos, m.end_pos)
                if span in doc_spans:
                    continue

                doc_spans.add(span)
                m.mention_id = 'EDL_%07d' % mention_id
                m.to_edl_file(fout)
                mention_id += 1
Exemplo n.º 38
0
def all_to_all(edl_file, dst_edl_file):
    """Load an EDL file, give each NIL mention its own distinct id, and
    save the result to dst_edl_file."""
    loaded = Mention.load_edl_file(edl_file)
    __assgin_different_id_to_all_nils(loaded)
    Mention.save_as_edl_file(loaded, dst_edl_file)
Exemplo n.º 39
0
 def from_text(cls, text):
     """Parse a ' :: '-separated string of mention texts into a new
     instance containing the parsed mentions."""
     parsed = []
     for chunk in text.split(' :: '):
         parsed.append(Mention.from_text(chunk.strip()))
     return cls(parsed)
Exemplo n.º 40
0
 def produce_mention(self, serif_doc, serif_mention):
     """Build a Mention from a serif mention and its document context."""
     syn = serif_mention.syn_node
     head = serif_mention.head
     return Mention(serif_mention.entity_type,
                    serif_mention.mention_type.value,
                    serif_mention.text,
                    head.text,
                    serif_doc.docid,
                    syn.start_char,
                    syn.end_char,
                    head.start_char,
                    head.end_char,
                    serif_mention.sent_no)
Exemplo n.º 41
0
def __name_expansion(edl_mentions_file, doc_ner_file, tokenized_text_file,
                     entity_candidates_dict_file, dst_file):
    """Expand mention names using document-level NER output, then save
    the updated mentions as an EDL file.

    tokenized_text_file and entity_candidates_dict_file are currently
    unused (location-name expansion is disabled).
    """
    loaded = Mention.load_edl_file(edl_mentions_file)
    __expand_name_with_ner_result(loaded, doc_ner_file)
    Mention.save_as_edl_file(loaded, dst_file)
Exemplo n.º 42
0
def __el_stat():
    """Compute and print entity-linking accuracy over an evaluation set.

    Compares the first legal candidate kbid of each mention (from the
    .bin docs-info file) against the gold kbid (from the EDL tab file),
    prints each error as 'qid<TAB>docid<TAB>name<TAB>gold-id_wid', then
    prints in-KB and total accuracy.
    """
    data_file = 'e:/data/emadr/el/tac/2009/eval/el-2009-eval-expansion-nloc-3.bin'
    gold_file = 'e:/data/el/LDC2015E19/data/2009/eval/data/mentions-raw.tab'
    # data_file = 'e:/data/emadr/el/tac/2011/eval/el-2011-eval-expansion-all-3.bin'
    # gold_file = 'e:/data/el/LDC2015E19/data/2011/eval/data/mentions-expansion-all.tab'
    # data_file = 'e:/data/emadr/el/tac/2014/eval/el-2014-eval-raw-%d.bin' % 3
    # gold_file = 'e:/data/el/LDC2015E20/data/eval/data/mentions-raw.tab'
    eid_wid_file = 'e:/data/el/res/eid_wid_ord_eid.txt'
    # keep_nil: score NIL-gold mentions too; when False they are skipped.
    keep_nil = True
    # When True, only errors whose gold id is absent from the candidate
    # list are printed.
    only_show_not_in_candidate = False

    eid_wid_dict = load_eid_wid_file(eid_wid_file)

    # gold_el_result = load_gold_el(gold_file)
    mentions = Mention.load_edl_file(gold_file)
    qid_mention_dict = Mention.group_mentions_by_qid(mentions)
    docs_info, dim = load_docs_info(data_file)

    error_list = list()
    num_mentions, nil_mentions = 0, 0
    nil_hit_cnt, id_hit_cnt = 0, 0
    for doc in docs_info:
        docid, docvec, mentions = doc
        for mention in mentions:
            (qid, kbids, commonnesses, vecs) = mention

            gold_mention = qid_mention_dict[qid]
            gold_id = gold_mention.kbid
            gold_id_is_nil = gold_id.startswith('NIL')
            if gold_id_is_nil:
                nil_mentions += 1
            if not keep_nil and gold_id_is_nil:
                continue
            num_mentions += 1

            indices, legal_kbids = __get_legal_kbids(kbids, keep_nil)

            # A NIL gold counts as a hit when there is no legal candidate
            # or the top candidate is an un-mapped Freebase mid ("m.").
            if gold_id_is_nil and (len(legal_kbids) == 0
                                   or legal_kbids[0].startswith('m.')):
                nil_hit_cnt += 1
                continue

            first_kbid = legal_kbids[0] if legal_kbids else 'NIL'

            if first_kbid == gold_id:
                id_hit_cnt += 1
                continue

            error_list.append(
                (qid, docid, gold_mention.name, gold_id, legal_kbids))

    # Print the errors sorted by mention name.
    error_list.sort(key=lambda x: x[2])
    for e in error_list:
        qid, docid, name, gold_id, legal_kbids = e
        gold_wid = eid_wid_dict.get(gold_id, -1)
        in_candidates = gold_id in legal_kbids

        if only_show_not_in_candidate and in_candidates:
            continue

        # if not in_candidates:
        #     print 'not found'
        print '%s\t%s\t%s\t%s_%d' % (qid, docid, name, gold_id, gold_wid)

        # for eid in legal_kbids:
        #     wid = eid_wid_dict.get(eid, -1)
        #     print '\t%s_%d' % (eid, wid),
        # print

    print id_hit_cnt, num_mentions
    print 'INKB: %f' % (float(id_hit_cnt) / (num_mentions - nil_mentions))
    print 'TOTAL: %f' % (float(id_hit_cnt + nil_hit_cnt) / num_mentions)