Exemplo n.º 1
0
def translate_coref(infile, outfile, translated):
    """Rewrite the markable spans of an XML file through a word alignment.

    Every ``markable`` element found in *infile* has each comma-separated
    component of its ``span`` attribute mapped through *translated*. The
    modified document is pretty-printed to *outfile*, and a summary of how
    many markables lost at least one span component is written to stderr.

    Args:
        infile: Path of the XML markable file to read.
        outfile: Path the rewritten XML document is written to.
        translated: Mapping from a source word index (as produced by
            ``mmax.parse_span``) to a target word index. The target index
            is incremented by one when rendered as ``word_N``.
    """
    with open(infile, 'r') as f:
        soup = bs4.BeautifulSoup(f, 'xml')

    total = 0
    skipped = 0
    for mrk in soup.find_all('markable'):
        total += 1

        span_parts = []
        # Tracks whether any component of this markable was dropped, so the
        # markable is counted once no matter how many components fail.
        dropped = False
        for in_span in mrk['span'].split(','):
            from_idx, to_idx = mmax.parse_span(in_span)
            if from_idx not in translated:
                print('Unaligned start word: ' + str(mrk), file=sys.stderr)
                dropped = True
                continue

            # NOTE(review): the branches below treat to_idx as exclusive
            # (to_idx - 1 is the last word); confirm when parse_span can
            # yield from_idx == to_idx.
            if from_idx == to_idx:
                span = 'word_%d' % (translated[from_idx] + 1)
            elif to_idx - 1 not in translated:
                print('Unaligned end word: ' + str(mrk), file=sys.stderr)
                dropped = True
                continue
            else:
                span = 'word_%d..word_%d' % tuple(
                    translated[i] + 1 for i in [from_idx, to_idx - 1])

            span_parts.append(span)

        if dropped:
            skipped += 1

        # A markable whose components were all dropped keeps an empty span.
        mrk['span'] = ','.join(span_parts)

    print('Skipped %d out of %d markables.' % (skipped, total),
          file=sys.stderr)
    with open(outfile, 'w') as f:
        print(soup.prettify(), file=f)
Exemplo n.º 2
0
def get_coref_chain_boundaries(mmax_dir, mmax_id):
    """Map word positions to bracket strings for coreference chains.

    The sentence-level markables are read first so that every word index
    can be associated with the ``orderid`` of its sentence. Each
    coreference markable with a non-empty ``coref_class`` other than
    ``'empty'`` then contributes boundary markers: ``(N)`` for a one-word
    span, or ``(N`` at the first word and ``N)`` at the last word of a
    longer span. Longer spans whose first and last word lie in different
    sentences are reported on stderr and contribute nothing.

    Args:
        mmax_dir: Directory handed to ``mmax.sentences_file`` and
            ``mmax.coref_file``.
        mmax_id: Document identifier handed to the same helpers.

    Returns:
        A dict from word position to the ``'|'``-joined markers recorded
        there. Markers of chains that contain a mention of type
        ``'clause'`` or ``'vp'`` are left out; a position with no markers
        left maps to ``'-'``.
    """
    with open(mmax.sentences_file(mmax_dir, mmax_id), 'r') as sent_fh:
        sentence_soup = bs4.BeautifulSoup(sent_fh, 'xml')

    # Word index -> orderid of the sentence markable covering it.
    sentence_of = {}
    for sent_mrk in sentence_soup.find_all('markable'):
        sentence_of.update({
            word_idx: sent_mrk['orderid']
            for word_idx in range(*mmax.parse_span(sent_mrk['span']))
        })

    with open(mmax.coref_file(mmax_dir, mmax_id), 'r') as coref_fh:
        coref_soup = bs4.BeautifulSoup(coref_fh, 'xml')

    # State for lookup_chain; presumably '__next__' holds the next free
    # chain number -- confirm against lookup_chain.
    directory = {'__next__': 1}
    boundaries = {}
    suppressed_chains = set()
    for mrk in coref_soup.find_all('markable'):
        coref_class = mrk['coref_class'] if mrk.has_attr('coref_class') else None
        if not coref_class or coref_class == 'empty':
            continue

        chain_idx = lookup_chain(directory, coref_class)

        if mrk['mention'] in ('clause', 'vp'):
            suppressed_chains.add(chain_idx)

        for part in mrk['span'].split(','):
            start, end = mmax.parse_span(part)
            last = end - 1

            if start == last:
                # One-word mention: opens and closes at the same position.
                append(boundaries, start, ('(%d)', chain_idx))
                continue

            if sentence_of[start] != sentence_of[last]:
                print('%s: Skipped cross-sentence mention (%d): %s' % (mmax_id, end - start, str(mrk)),
                        file=sys.stderr)
                continue

            append(boundaries, start, ('(%d', chain_idx))
            append(boundaries, last, ('%d)', chain_idx))

    rendered = {}
    for pos, chains in boundaries.items():
        visible = [fmt % idx for fmt, idx in chains
                   if idx not in suppressed_chains]
        rendered[pos] = '|'.join(visible) if visible else '-'

    return rendered
Exemplo n.º 3
0
def compare_mmax(dir1, dir2, mmax_dir, mmax_id):
    """Compare the markable texts of one document in two directory trees.

    The word lists and the ``sentence`` and ``coref`` markable levels are
    loaded from ``dir1/mmax_dir`` and ``dir2/mmax_dir``. Markables of each
    level are sorted by their parsed spans and compared pairwise: the words
    covered by corresponding span components are concatenated, the quote
    tokens `` and '' are replaced by ", and the resulting texts are
    compared. Differing components are collected and, if there are any,
    passed to ``create_checks_level`` for the second directory.

    A level whose markable counts differ, or a pair of markables whose
    number of span components differs, is reported on stderr and skipped.

    Args:
        dir1: Root of the first directory tree.
        dir2: Root of the second directory tree.
        mmax_dir: Sub-directory, relative to both roots, holding the document.
        mmax_id: Document identifier used to build the file names.
    """
    first_dir = os.path.join(dir1, mmax_dir)
    second_dir = os.path.join(dir2, mmax_dir)

    def read_words(directory):
        # Text content of every <word> element, in document order.
        with open(mmax.words_file(directory, mmax_id), 'r') as handle:
            word_tags = bs4.BeautifulSoup(handle, 'xml').find_all('word')
            return [tag.string for tag in word_tags]

    def read_markables(path):
        # All <markable> elements of one level file as a plain list.
        with open(path, 'r') as handle:
            return list(bs4.BeautifulSoup(handle, 'xml').find_all('markable'))

    def span_order(markable):
        # Sort key: the parsed form of every span component.
        return [mmax.parse_span(part) for part in markable['span'].split(',')]

    def covered_text(words, begin, end):
        # Quotes are changed by the tokeniser
        joined = ''.join(words[begin:end])
        return joined.replace("``", '"').replace("''", '"')

    words1 = read_words(first_dir)
    words2 = read_words(second_dir)

    level_path = '%s/Markables/%s_%s_level.xml'
    mismatches = []

    for level in ('sentence', 'coref'):
        markables1 = read_markables(level_path % (first_dir, mmax_id, level))
        markables2 = read_markables(level_path % (second_dir, mmax_id, level))

        markables1.sort(key=span_order)
        markables2.sort(key=span_order)

        count1, count2 = len(markables1), len(markables2)
        if count1 != count2:
            print('%s/%s: Number of markables does not match (%d != %d)' %
                  (mmax_dir, mmax_id, count1, count2),
                  file=sys.stderr)
            continue

        for left, right in zip(markables1, markables2):
            left_parts = left['span'].split(',')
            right_parts = right['span'].split(',')
            if len(left_parts) != len(right_parts):
                print(
                    '%s/%s: Number of span components does not match (%s / %s)'
                    % (mmax_dir, mmax_id, left['span'], right['span']),
                    file=sys.stderr)
                continue

            for left_part, right_part in zip(left_parts, right_parts):
                begin1, end1 = mmax.parse_span(left_part)
                begin2, end2 = mmax.parse_span(right_part)

                if (covered_text(words1, begin1, end1)
                        != covered_text(words2, begin2, end2)):
                    mismatches.append((level, right_part))

    if mismatches:
        print(mmax_dir, mmax_id, file=sys.stderr)
        create_checks_level(second_dir, mmax_id, mismatches)
Exemplo n.º 4
0
 def sort_key(mrk):
     """Return the parsed form of each comma-separated span component."""
     components = mrk['span'].split(',')
     return list(map(mmax.parse_span, components))