Пример #1
0
    def _postproc_sent(sent):
        nonlocal lc_tbd

        assert 'sent_id' in sent, sent

        # check that tokens are numbered from 1, in order
        for i, tok in enumerate(sent['toks'], 1):
            assert tok['#'] == i

        # check that MWEs are numbered from 1 based on first token offset
        xmwes = [(e["toknums"][0], 's', mwenum)
                 for mwenum, e in sent['smwes'].items()]
        xmwes += [(e["toknums"][0], 'w', mwenum)
                  for mwenum, e in sent['wmwes'].items()]
        xmwes.sort()
        for k, mwe in chain(sent['smwes'].items(), sent['wmwes'].items()):
            assert xmwes[int(k) - 1][
                2] == k, f"In {sent['sent_id']}, MWEs are not numbered in the correct order: use normalize_mwe_numbering.py to fix"

        # check that lexical & weak MWE lemmas are correct
        lexes_to_validate = chain(
            sent['swes'].values(),
            sent['smwes'].values()) if validate_type else []
        for lexe in lexes_to_validate:
            sent['toks'][lexe['toknums'][0] - 1]
            assert lexe['lexlemma'] == ' '.join(
                sent['toks'][i - 1]['lemma'] for i in lexe['toknums']
            ), f"In {sent['sent_id']}, MWE lemma is incorrect: {lexe} vs. {sent['toks'][lexe['toknums'][0]-1]}"
            lc = lexe['lexcat']
            if lc.endswith('!@'): lc_tbd += 1
            valid_ss = supersenses_for_lexcat(lc)
            if lc == 'V':
                assert len(
                    lexe['toknums']
                ) == 1, f'In {sent["sent_id"]}, Verbal MWE "{lexe["lexlemma"]}" lexcat must be subtyped (V.VID, etc., not V)'
            ss, ss2 = lexe['ss'], lexe['ss2']
            if valid_ss:
                if ss == '??':
                    assert ss2 is None
                elif ss not in valid_ss or (lc in (
                        'N', 'V') or lc.startswith('V.')) != (ss2 is None) or (
                            ss2 is not None and ss2 not in valid_ss):
                    assert False, f"In {sent['sent_id']}, invalid supersense(s) in lexical entry: {lexe}"
                elif ss.startswith('p.'):
                    assert ss2.startswith('p.')
                    assert ss2 not in {
                        'p.Experiencer', 'p.Stimulus', 'p.Originator',
                        'p.Recipient', 'p.SocialRel', 'p.OrgRole'
                    }, (f'{ss2} should never be function', lexe)
                    if ss != ss2:
                        ssA, ss2A = ancestors(ss), ancestors(ss2)
                        # there are just a few permissible combinations where one is the ancestor of the other
                        if (ss, ss2) not in {('p.Whole', 'p.Gestalt'),
                                             ('p.Goal', 'p.Locus'),
                                             ('p.Circumstance', 'p.Locus'),
                                             ('p.Circumstance', 'p.Path'),
                                             ('p.Locus', 'p.Goal'),
                                             ('p.Locus', 'p.Source'),
                                             ('p.Characteristic', 'p.Stuff')}:
                            assert ss not in ss2A, f"In {sent['sent_id']}, unexpected construal: {ss} ~> {ss2}"
                            assert ss2 not in ssA, f"In {sent['sent_id']}, unexpected construal: {ss} ~> {ss2}"
            else:
                assert ss is None and ss2 is None and lexe not in (
                    'N', 'V', 'P', 'INF.P', 'PP', 'POSS', 'PRON.POSS'), lexe

        # check lexcat on single-word expressions
        for swe in sent['swes'].values():
            tok = sent['toks'][swe['toknums'][0] - 1]
            upos, xpos = tok['upos'], tok['xpos']
            lc = swe['lexcat']
            if lc.endswith('!@'): continue
            if lc not in ALL_LEXCATS:
                assert not validate_type, f"In {sent['sent_id']}, invalid lexcat {lc} for single-word expression '{tok['word']}'"
                continue
            if validate_pos and upos != lc and lc != 'PP' and (
                    upos, lc) not in {('NOUN', 'N'), ('PROPN', 'N'),
                                      ('VERB', 'V'), ('ADP', 'P'),
                                      ('ADV', 'P'), ('SCONJ', 'P'),
                                      ('ADP', 'DISC'), ('ADV', 'DISC'),
                                      ('SCONJ', 'DISC'), ('PART', 'POSS')}:
                # most often, the single-word lexcat should match its upos
                # check a list of exceptions
                mismatchOK = False
                if xpos == 'TO' and lc.startswith('INF'):
                    mismatchOK = True
                elif (xpos == 'TO') != lc.startswith('INF'):
                    assert upos == 'SCONJ' and swe['lexlemma'] == 'for', (
                        sent['sent_id'], swe, tok)
                    mismatchOK = True

                if (upos in ('NOUN', 'PROPN')) != (lc == 'N'):
                    try:
                        assert upos in ('SYM', 'X') or (lc in (
                            'PRON', 'DISC')), (sent['sent_id'], swe, tok)
                    except AssertionError:
                        print('Suspicious lexcat/POS combination:',
                              sent['sent_id'],
                              swe,
                              tok,
                              file=sys.stderr)
                    mismatchOK = True
                message = f"In {sent['sent_id']}, single-word expression '{tok['word']}' has lexcat {lc}, which is incompatible with its upos {upos}"
                if (upos == 'AUX') != (lc == 'AUX'):
                    assert tok[
                        'lemma'] == 'be' and lc == 'V', message  # copula has upos=AUX
                    mismatchOK = True
                if (upos == 'VERB') != (lc == 'V'):
                    if lc == 'ADJ':
                        print(
                            'Word treated as VERB in UD, ADJ for supersenses:',
                            sent['sent_id'],
                            tok['word'],
                            file=sys.stderr)
                    else:
                        assert tok[
                            'lemma'] == 'be' and lc == 'V', message  # copula has upos=AUX
                    mismatchOK = True
                if upos == 'PRON':
                    assert lc == 'PRON' or lc == 'PRON.POSS', message
                    mismatchOK = True
                if lc == 'ADV':
                    assert upos == 'ADV' or upos == 'PART', message  # PART is for negations
                    mismatchOK = True
                if upos == 'ADP' and lc == 'CCONJ':
                    assert tok['lemma'] == 'versus'
                    mismatchOK = True

                assert mismatchOK, message
            if validate_type:
                assert lc != 'PP', f"In {sent['sent_id']}, PP should only apply to strong MWEs, but occurs for single-word expression '{tok['word']}'"
        for smwe in sent['smwes'].values():
            assert len(smwe['toknums']) > 1
        for wmwe in sent['wmwes'].values():
            assert len(
                wmwe['toknums']
            ) > 1, f"In {sent['sent_id']}, weak MWE has only one token according to group indices: {wmwe}"
            assert wmwe['lexlemma'] == ' '.join(
                sent['toks'][i - 1]['lemma']
                for i in wmwe['toknums']), (wmwe,
                                            sent['toks'][wmwe['toknums'][0] -
                                                         1])
        # we already checked that noninitial tokens in an MWE have _ as their lemma

        # check lextags
        smweGroups = [smwe['toknums'] for smwe in sent['smwes'].values()]
        wmweGroups = [wmwe['toknums'] for wmwe in sent['wmwes'].values()]
        if 'mwe' not in sent:
            sent['mwe'] = render_sent(sent, False, False)
        tagging = sent_tags(len(sent['toks']), sent['mwe'], smweGroups,
                            wmweGroups)
        for tok, tag in zip(sent['toks'], tagging):
            fulllextag = tag
            if tok['smwe']:
                smweNum, position = tok['smwe']
                lexe = sent['smwes'][smweNum]
            else:
                position = None
                lexe = sent['swes'][tok['#']]

            if position is None or position == 1:
                lexcat = lexe['lexcat']
                fulllextag += '-' + lexcat
                ss1, ss2 = lexe['ss'], lexe['ss2']
                if ss1 is not None:
                    assert ss1
                    fulllextag += '-' + ss1
                    if ss2 is not None and ss2 != ss1:
                        assert ss2
                        fulllextag += '|' + ss2
                if tok['wmwe']:
                    wmweNum, position = tok['wmwe']
                    wmwe = sent['wmwes'][wmweNum]
                    wcat = wmwe['lexcat']
                    if wcat and position == 1:
                        fulllextag += '+' + wcat

            assert tok[
                'lextag'] == fulllextag, f"In {sent['sent_id']}, the full tag at the end of the line is inconsistent with the rest of the line ({fulllextag} expected): {tok}"

        # check rendered MWE string
        s = render([tok['word'] for tok in sent['toks']], smweGroups,
                   wmweGroups)
        if sent['mwe'] != s:
            caveat = ' (may be due to simplification)' if '$1' in sent[
                'mwe'] else ''
            print(f'MWE string mismatch{caveat}:',
                  s,
                  sent['mwe'],
                  sent['sent_id'],
                  file=sys.stderr)
Пример #2
0
    def _postproc_sent(sent):
        nonlocal lc_tbd

        # check that tokens are numbered from 1, in order
        for i, tok in enumerate(sent['toks'], 1):
            assert tok['#'] == i

        # check that MWEs are numbered from 1
        # fix_mwe_numbering.py was written to correct this
        for i, (k, mwe) in enumerate(
                sorted(chain(sent['smwes'].items(), sent['wmwes'].items()),
                       key=lambda x: int(x[0])), 1):
            assert int(k) == i, (sent['sent_id'], i, k, mwe)

        # check that lexical & weak MWE lemmas are correct
        for lexe in chain(sent['swes'].values(), sent['smwes'].values()):
            assert lexe['lexlemma'] == ' '.join(
                sent['toks'][i - 1]['lemma'] for i in lexe['toknums']
            ), f"In {sent['sent_id']}, MWE lemma is incorrect: {lexe} vs. {sent['toks'][lexe['toknums'][0]-1]}"
            lc = lexe['lexcat']
            if lc.endswith('!@'): lc_tbd += 1
            valid_ss = supersenses_for_lexcat(lc)
            if lc == 'V':
                assert len(
                    lexe['toknums']
                ) == 1, f'Verbal MWE lexcat must be subtyped (V.VID, etc., not V): {lexe}'
            ss, ss2 = lexe['ss'], lexe['ss2']
            if valid_ss:
                if ss == '??':
                    assert ss2 is None
                elif ss not in valid_ss or (lc in (
                        'N', 'V') or lc.startswith('V.')) != (ss2 is None) or (
                            ss2 is not None and ss2 not in valid_ss):
                    assert False, f"In {sent['sent_id']}, invalid supersense(s) in lexical entry: {lexe}"
                elif ss.startswith('p.'):
                    assert ss2.startswith('p.')
                    assert ss2 not in {
                        'p.Experiencer', 'p.Stimulus', 'p.Originator',
                        'p.Recipient', 'p.SocialRel', 'p.OrgRole'
                    }, (f'{ss2} should never be function', lexe)
                    if ss != ss2:
                        ssA, ss2A = ancestors(ss), ancestors(ss2)
                        # there are just a few permissible combinations where one is the ancestor of the other
                        if (ss, ss2) not in {('p.Whole', 'p.Gestalt'),
                                             ('p.Goal', 'p.Locus'),
                                             ('p.Circumstance', 'p.Locus'),
                                             ('p.Circumstance', 'p.Path'),
                                             ('p.Locus', 'p.Goal'),
                                             ('p.Locus', 'p.Source'),
                                             ('p.Characteristic', 'p.Stuff')}:
                            assert ss not in ss2A, f"In {sent['sent_id']}, unexpected construal: {ss} ~> {ss2}"
                            assert ss2 not in ssA, f"In {sent['sent_id']}, unexpected construal: {ss} ~> {ss2}"
            else:
                assert ss is None and ss2 is None and lexe not in (
                    'N', 'V', 'P', 'INF.P', 'PP', 'POSS', 'PRON.POSS'), lexe

        # check lexcat on single-word expressions
        for swe in sent['swes'].values():
            tok = sent['toks'][swe['toknums'][0] - 1]
            upos, xpos = tok['upos'], tok['xpos']
            lc = swe['lexcat']
            if lc.endswith('!@'): continue
            assert lc in ALL_LEXCATS, f"In {sent['sent_id']}, invalid lexcat for single-word expression: {lc} in {tok}"
            if (xpos == 'TO') != lc.startswith('INF'):
                assert upos == 'SCONJ' and swe['lexlemma'] == 'for', (
                    sent['sent_id'], swe, tok)
            if (upos in ('NOUN', 'PROPN')) != (lc == 'N'):
                try:
                    assert upos in ('SYM', 'X') or (lc in ('PRON', 'DISC')), (
                        sent['sent_id'], swe, tok)
                except AssertionError:
                    print('Suspicious lexcat/POS combination:',
                          sent['sent_id'],
                          swe,
                          tok,
                          file=sys.stderr)
            if (upos == 'AUX') != (lc == 'AUX'):
                assert tok['lemma'] == 'be' and lc == 'V', (
                    sent['sent_id'], tok)  # copula has upos=AUX
            if (upos == 'VERB') != (lc == 'V'):
                if lc == 'ADJ':
                    print('Word treated as VERB in UD, ADJ for supersenses:',
                          sent['sent_id'],
                          tok['word'],
                          file=sys.stderr)
                else:
                    assert tok['lemma'] == 'be' and lc == 'V', (
                        sent['sent_id'], tok)  # copula has upos=AUX
            if upos == 'PRON':
                assert lc == 'PRON' or lc == 'PRON.POSS', (sent['sent_id'],
                                                           tok)
            if lc == 'ADV':
                assert upos == 'ADV' or upos == 'PART', (
                    sent['sent_id'], tok)  # PART is for negations
            assert lc != 'PP', ('PP should only apply to strong MWEs',
                                sent['sent_id'], tok)
        for smwe in sent['smwes'].values():
            assert len(smwe['toknums']) > 1
        for wmwe in sent['wmwes'].values():
            assert len(
                wmwe['toknums']
            ) > 1, f"In {sent['sent_id']}, weak MWE has only one token according to group indices: {wmwe}"
            assert wmwe['lexlemma'] == ' '.join(
                sent['toks'][i - 1]['lemma']
                for i in wmwe['toknums']), (wmwe,
                                            sent['toks'][wmwe['toknums'][0] -
                                                         1])
        # we already checked that noninitial tokens in an MWE have _ as their lemma

        # check lextags
        smweGroups = [smwe['toknums'] for smwe in sent['smwes'].values()]
        wmweGroups = [wmwe['toknums'] for wmwe in sent['wmwes'].values()]
        tagging = sent_tags(len(sent['toks']), sent['mwe'], smweGroups,
                            wmweGroups)
        for tok, tag in zip(sent['toks'], tagging):
            fulllextag = tag
            if tok['smwe']:
                smweNum, position = tok['smwe']
                lexe = sent['smwes'][smweNum]
            else:
                position = None
                lexe = sent['swes'][tok['#']]

            if position is None or position == 1:
                lexcat = lexe['lexcat']
                fulllextag += '-' + lexcat
                ss1, ss2 = lexe['ss'], lexe['ss2']
                if ss1 is not None:
                    assert ss1
                    fulllextag += '-' + ss1
                    if ss2 is not None and ss2 != ss1:
                        assert ss2
                        fulllextag += '|' + ss2
                if tok['wmwe']:
                    wmweNum, position = tok['wmwe']
                    wmwe = sent['wmwes'][wmweNum]
                    wcat = wmwe['lexcat']
                    if wcat and position == 1:
                        fulllextag += '+' + wcat

            assert tok[
                'lextag'] == fulllextag, f"In {sent['sent_id']}, the full tag at the end of the line is inconsistent with the rest of the line ({fulllextag} expected): {tok}"

        # check rendered MWE string
        s = render([tok['word'] for tok in sent['toks']], smweGroups,
                   wmweGroups)
        if sent['mwe'] != s:
            caveat = ' (may be due to simplification)' if '$1' in sent[
                'mwe'] else ''
            print(f'MWE string mismatch{caveat}:',
                  s,
                  sent['mwe'],
                  sent['sent_id'],
                  file=sys.stderr)