Example #1
    def _postproc_sent(sent):
        nonlocal lc_tbd
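        # lc_tbd tallies lexical categories still marked '!@' (to be determined); it is incremented further down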

        assert 'sent_id' in sent, sent

        # check that tokens are numbered from 1, in order
        for i, tok in enumerate(sent['toks'], 1):
            assert tok['#'] == i

        # check that MWEs are numbered from 1 based on first token offset
        xmwes = [(e["toknums"][0], 's', mwenum)
                 for mwenum, e in sent['smwes'].items()]
        xmwes += [(e["toknums"][0], 'w', mwenum)
                  for mwenum, e in sent['wmwes'].items()]
        xmwes.sort()
        for k, mwe in chain(sent['smwes'].items(), sent['wmwes'].items()):
            assert xmwes[int(k) - 1][2] == k, \
                f"In {sent['sent_id']}, MWEs are not numbered in the correct order: use normalize_mwe_numbering.py to fix"

        # check that lexical & weak MWE lemmas are correct
        lexes_to_validate = chain(
            sent['swes'].values(),
            sent['smwes'].values()) if validate_type else []
        for lexe in lexes_to_validate:
            sent['toks'][lexe['toknums'][0] - 1]  # touch the first token: raises IndexError if the token number is out of range
            assert lexe['lexlemma'] == ' '.join(
                sent['toks'][i - 1]['lemma'] for i in lexe['toknums']
            ), f"In {sent['sent_id']}, MWE lemma is incorrect: {lexe} vs. {sent['toks'][lexe['toknums'][0]-1]}"
            lc = lexe['lexcat']
            if lc.endswith('!@'): lc_tbd += 1
            valid_ss = supersenses_for_lexcat(lc)
            if lc == 'V':
                assert len(lexe['toknums']) == 1, \
                    f'In {sent["sent_id"]}, Verbal MWE "{lexe["lexlemma"]}" lexcat must be subtyped (V.VID, etc., not V)'
            ss, ss2 = lexe['ss'], lexe['ss2']
            if valid_ss:
                if ss == '??':
                    assert ss2 is None
                elif ss not in valid_ss or (lc in (
                        'N', 'V') or lc.startswith('V.')) != (ss2 is None) or (
                            ss2 is not None and ss2 not in valid_ss):
                    assert False, f"In {sent['sent_id']}, invalid supersense(s) in lexical entry: {lexe}"
                elif ss.startswith('p.'):
                    assert ss2.startswith('p.')
                    assert ss2 not in {
                        'p.Experiencer', 'p.Stimulus', 'p.Originator',
                        'p.Recipient', 'p.SocialRel', 'p.OrgRole'
                    }, (f'{ss2} should never be function', lexe)
                    if ss != ss2:
                        ssA, ss2A = ancestors(ss), ancestors(ss2)
                        # there are just a few permissible combinations where one is the ancestor of the other
                        if (ss, ss2) not in {('p.Whole', 'p.Gestalt'),
                                             ('p.Goal', 'p.Locus'),
                                             ('p.Circumstance', 'p.Locus'),
                                             ('p.Circumstance', 'p.Path'),
                                             ('p.Locus', 'p.Goal'),
                                             ('p.Locus', 'p.Source'),
                                             ('p.Characteristic', 'p.Stuff')}:
                            assert ss not in ss2A, f"In {sent['sent_id']}, unexpected construal: {ss} ~> {ss2}"
                            assert ss2 not in ssA, f"In {sent['sent_id']}, unexpected construal: {ss} ~> {ss2}"
            else:
                # lexcats with no supersense slots must not carry ss/ss2
                assert ss is None and ss2 is None and lc not in (
                    'N', 'V', 'P', 'INF.P', 'PP', 'POSS', 'PRON.POSS'), lexe

        # check lexcat on single-word expressions
        for swe in sent['swes'].values():
            tok = sent['toks'][swe['toknums'][0] - 1]
            upos, xpos = tok['upos'], tok['xpos']
            lc = swe['lexcat']
            if lc.endswith('!@'): continue
            if lc not in ALL_LEXCATS:
                assert not validate_type, f"In {sent['sent_id']}, invalid lexcat {lc} for single-word expression '{tok['word']}'"
                continue
            if validate_pos and upos != lc and lc != 'PP' and (
                    upos, lc) not in {('NOUN', 'N'), ('PROPN', 'N'),
                                      ('VERB', 'V'), ('ADP', 'P'),
                                      ('ADV', 'P'), ('SCONJ', 'P'),
                                      ('ADP', 'DISC'), ('ADV', 'DISC'),
                                      ('SCONJ', 'DISC'), ('PART', 'POSS')}:
                # most often, the single-word lexcat should match its upos
                # check a list of exceptions
                mismatchOK = False
                if xpos == 'TO' and lc.startswith('INF'):
                    mismatchOK = True
                elif (xpos == 'TO') != lc.startswith('INF'):
                    assert upos == 'SCONJ' and swe['lexlemma'] == 'for', (
                        sent['sent_id'], swe, tok)
                    mismatchOK = True

                if (upos in ('NOUN', 'PROPN')) != (lc == 'N'):
                    try:
                        assert upos in ('SYM', 'X') or (lc in (
                            'PRON', 'DISC')), (sent['sent_id'], swe, tok)
                    except AssertionError:
                        print('Suspicious lexcat/POS combination:',
                              sent['sent_id'],
                              swe,
                              tok,
                              file=sys.stderr)
                    mismatchOK = True
                message = f"In {sent['sent_id']}, single-word expression '{tok['word']}' has lexcat {lc}, which is incompatible with its upos {upos}"
                if (upos == 'AUX') != (lc == 'AUX'):
                    assert tok['lemma'] == 'be' and lc == 'V', message  # copula has upos=AUX
                    mismatchOK = True
                if (upos == 'VERB') != (lc == 'V'):
                    if lc == 'ADJ':
                        print(
                            'Word treated as VERB in UD, ADJ for supersenses:',
                            sent['sent_id'],
                            tok['word'],
                            file=sys.stderr)
                    else:
                        assert tok['lemma'] == 'be' and lc == 'V', message  # copula has upos=AUX
                    mismatchOK = True
                if upos == 'PRON':
                    assert lc == 'PRON' or lc == 'PRON.POSS', message
                    mismatchOK = True
                if lc == 'ADV':
                    assert upos == 'ADV' or upos == 'PART', message  # PART is for negations
                    mismatchOK = True
                if upos == 'ADP' and lc == 'CCONJ':
                    assert tok['lemma'] == 'versus'
                    mismatchOK = True

                assert mismatchOK, message
            if validate_type:
                assert lc != 'PP', f"In {sent['sent_id']}, PP should only apply to strong MWEs, but occurs for single-word expression '{tok['word']}'"
        for smwe in sent['smwes'].values():
            assert len(smwe['toknums']) > 1
        for wmwe in sent['wmwes'].values():
            assert len(wmwe['toknums']) > 1, \
                f"In {sent['sent_id']}, weak MWE has only one token according to group indices: {wmwe}"
            assert wmwe['lexlemma'] == ' '.join(
                sent['toks'][i - 1]['lemma'] for i in wmwe['toknums']
            ), (wmwe, sent['toks'][wmwe['toknums'][0] - 1])
        # we already checked that noninitial tokens in an MWE have _ as their lemma

        # check lextags
        smweGroups = [smwe['toknums'] for smwe in sent['smwes'].values()]
        wmweGroups = [wmwe['toknums'] for wmwe in sent['wmwes'].values()]
        if 'mwe' not in sent:
            sent['mwe'] = render_sent(sent, False, False)
        tagging = sent_tags(len(sent['toks']), sent['mwe'], smweGroups,
                            wmweGroups)
        for tok, tag in zip(sent['toks'], tagging):
            fulllextag = tag
            if tok['smwe']:
                smweNum, position = tok['smwe']
                lexe = sent['smwes'][smweNum]
            else:
                position = None
                lexe = sent['swes'][tok['#']]

            if position is None or position == 1:
                lexcat = lexe['lexcat']
                fulllextag += '-' + lexcat
                ss1, ss2 = lexe['ss'], lexe['ss2']
                if ss1 is not None:
                    assert ss1
                    fulllextag += '-' + ss1
                    if ss2 is not None and ss2 != ss1:
                        assert ss2
                        fulllextag += '|' + ss2
                if tok['wmwe']:
                    wmweNum, position = tok['wmwe']
                    wmwe = sent['wmwes'][wmweNum]
                    wcat = wmwe['lexcat']
                    if wcat and position == 1:
                        fulllextag += '+' + wcat

            assert tok['lextag'] == fulllextag, \
                f"In {sent['sent_id']}, the full tag at the end of the line is inconsistent with the rest of the line ({fulllextag} expected): {tok}"

        # check rendered MWE string
        s = render([tok['word'] for tok in sent['toks']], smweGroups,
                   wmweGroups)
        if sent['mwe'] != s:
            caveat = ' (may be due to simplification)' if '$1' in sent['mwe'] else ''
            print(f'MWE string mismatch{caveat}:',
                  s,
                  sent['mwe'],
                  sent['sent_id'],
                  file=sys.stderr)
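
The lextag-assembly loop at the end of this example can be read as a small standalone helper. The sketch below is illustrative only: the name assemble_lextag and its argument layout are invented here, while the field names (lexcat, ss, ss2) and the '-', '|', '+' separators follow the code above.

    def assemble_lextag(mwe_tag, lexe, position=None, wmwe_lexcat=None, wmwe_position=None):
        """Compose a full lextag: the MWE positional tag, then lexcat and
        supersense(s) if this token begins (or is) a strong lexical expression,
        then '+<wcat>' if it also begins a weak MWE that carries a lexcat."""
        tag = mwe_tag
        if position is None or position == 1:
            tag += '-' + lexe['lexcat']
            ss1, ss2 = lexe['ss'], lexe['ss2']
            if ss1 is not None:
                tag += '-' + ss1
                if ss2 is not None and ss2 != ss1:  # construal with distinct role and function
                    tag += '|' + ss2
            if wmwe_lexcat and wmwe_position == 1:
                tag += '+' + wmwe_lexcat
        return tag

    # e.g. assemble_lextag('O', {'lexcat': 'P', 'ss': 'p.Locus', 'ss2': 'p.Goal'})
    # -> 'O-P-p.Locus|p.Goal'
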
Example #2
    def _postproc_sent(sent):
        nonlocal lc_tbd

        sent['autoid_swes'] = sent.get('autoid_swes') or {}
        sent['autoid_smwes'] = sent.get('autoid_smwes') or {}

        # autoid/goldid: pick one according to args. For autoid, fill in gold ss, ss2 when there is an exact token-span match in the gold identification
        if identification == 'autoid':
            for auto_we in chain(sent['autoid_swes'].values(),
                                 sent['autoid_smwes'].values()):
                matching_gold_wes = [
                    we for we in chain(sent['swes'].values(),
                                       sent['smwes'].values())
                    if set(we['toknums']) == set(auto_we['toknums'])
                ]
                gold_we = (matching_gold_wes + [None])[0]  # first exact-span gold match, or None
                if gold_we and all([
                        ss is None or '.' in ss
                        for ss in [gold_we['ss'], gold_we['ss2']]
                ]):
                    auto_we['ss'], auto_we['ss2'] = gold_we['ss'], gold_we['ss2']
                else:
                    auto_we['ss'], auto_we['ss2'] = None, None
            sent['swes'], sent['smwes'] = sent['autoid_swes'], sent['autoid_smwes']
            for tok in sent['toks']:
                tok['smwe'] = tok.get('autoid_smwe')
                if 'autoid_smwe' in tok:
                    del tok['autoid_smwe']
                tok['wmwe'] = None
            sent['wmwes'] = {}

        del sent['autoid_smwes']
        del sent['autoid_swes']

        # check that tokens are numbered from 1, in order
        for i, tok in enumerate(sent['toks'], 1):
            assert tok['#'] == i

        # check that MWEs are numbered from 1
        # fix_mwe_numbering.py was written to correct this
        for i, (k, mwe) in enumerate(
                sorted(chain(sent['smwes'].items(), sent['wmwes'].items()),
                       key=lambda x: int(x[0])), 1):
            assert int(k) == i, (sent['sent_id'], i, k, mwe)

        # fill in lemmas for single-word and strong MWE lexical expressions from the token lemmas
        for lexe in chain(sent['swes'].values(), sent['smwes'].values()):
            lexe['lexlemma'] = ' '.join(sent['toks'][i - 1]['lemma']
                                        for i in lexe['toknums'])
            lc = lexe['lexcat']
            if lc.endswith('!@'): lc_tbd += 1
            valid_ss = supersenses_for_lexcat(lc)
            ss, ss2 = lexe['ss'], lexe['ss2']
            if valid_ss:
                if ss == '??':
                    assert ss2 is None
                elif ss not in valid_ss or (lc in ('N', 'V')) != (
                        ss2 is None) or (ss2 is not None
                                         and ss2 not in valid_ss):
                    print('Invalid supersense(s) in lexical entry:',
                          lexe,
                          file=sys.stderr)
                elif ss.startswith('p.'):
                    assert ss2.startswith('p.')
                    assert ss2 not in {
                        'p.Experiencer', 'p.Stimulus', 'p.Originator',
                        'p.Recipient', 'p.SocialRel', 'p.OrgRole'
                    }, (ss2 + ' should never be function', lexe)
            else:
                assert ss is None and ss2 is None and lc not in (
                    'N', 'V', 'P', 'INF.P', 'PP', 'POSS', 'PRON.POSS'), lexe

        # check lexcat on single-word expressions
        for swe in sent['swes'].values():
            tok = sent['toks'][swe['toknums'][0] - 1]
            upos, xpos = tok['upos'], tok['xpos']
            lc = swe['lexcat']
            if lc.endswith('!@'): continue
            assert lc in ALL_LEXCATS, (sent['sent_id'], tok)
            if (xpos == 'TO') != lc.startswith('INF'):
                # assert upos=='SCONJ' and swe['lexlemma']=='for',(sent['sent_id'],swe,tok)
                pass
            if (upos in ('NOUN', 'PROPN')) != (lc == 'N'):
                try:
                    assert upos in ('SYM', 'X') or (lc in ('PRON', 'DISC')), (
                        sent['sent_id'], swe, tok)
                except AssertionError:
                    print('Suspicious lexcat/POS combination:',
                          sent['sent_id'],
                          swe,
                          tok,
                          file=sys.stderr)
            if (upos == 'AUX') != (lc == 'AUX'):
                # assert tok['lemma']=='be' and lc=='V',(sent['sent_id'],tok)    # copula has upos=AUX
                pass
            if (upos == 'VERB') != (lc == 'V'):
                if lc == 'ADJ':
                    print('Word treated as VERB in UD, ADJ for supersenses:',
                          sent['sent_id'],
                          tok['word'],
                          file=sys.stderr)
                else:
                    # assert tok['lemma']=='be' and lc=='V',(sent['sent_id'],tok)    # copula has upos=AUX
                    pass
            if upos == 'PRON':
                # assert lc=='PRON' or lc=='PRON.POSS',(sent['sent_id'],tok)
                pass
            if lc == 'ADV':
                # assert upos=='ADV' or upos=='PART',(sent['sent_id'],tok)    # PART is for negations
                pass
            assert lc != 'PP', ('PP should only apply to strong MWEs',
                                sent['sent_id'], tok)
        for smwe in sent['smwes'].values():
            assert len(smwe['toknums']) > 1
        for wmwe in sent['wmwes'].values():
            assert len(wmwe['toknums']) > 1, (sent['sent_id'], wmwe)
            # assert wmwe['lexlemma']==' '.join(sent['toks'][i-1]['lemma'] for i in wmwe['toknums']),(wmwe,sent['toks'][wmwe['toknums'][0]-1])
        # we already checked that noninitial tokens in an MWE have _ as their lemma

        # check lextags
        smweGroups = [smwe['toknums'] for smwe in sent['smwes'].values()]
        wmweGroups = [wmwe['toknums'] for wmwe in sent['wmwes'].values()]
        tagging = sent_tags(len(sent['toks']), sent['mwe'], smweGroups,
                            wmweGroups)
        for tok, tag in zip(sent['toks'], tagging):
            fulllextag = tag
            if tok['smwe']:
                smweNum, position = tok['smwe']
                lexe = sent['smwes'][smweNum]
            elif tok['#'] in sent['swes']:
                position = None
                lexe = sent['swes'][tok['#']]
            else:
                lexe = None

            if lexe and (position is None or position == 1):
                lexcat = lexe['lexcat']
                fulllextag += '-' + lexcat
                ss1, ss2 = lexe['ss'], lexe['ss2']
                if ss1 is not None:
                    assert ss1
                    fulllextag += '-' + ss1
                    if ss2 is not None and ss2 != ss1:
                        assert ss2
                        fulllextag += '|' + ss2
                if tok['wmwe']:
                    wmweNum, position = tok['wmwe']
                    wmwe = sent['wmwes'][wmweNum]
                    wcat = wmwe['lexcat']
                    if wcat and position == 1:
                        fulllextag += '+' + wcat

            # assert tok['lextag']==fulllextag,(sent['sent_id'],fulllextag,tok)

        # check rendered MWE string
        s = render([tok['word'] for tok in sent['toks']], smweGroups,
                   wmweGroups)
        if sent['mwe'] != s:
            caveat = ' (may be due to simplification)' if '$1' in sent['mwe'] else ''
            print('MWE string mismatch' + caveat + ':',
                  s,
                  sent['mwe'],
                  sent['sent_id'],
                  file=sys.stderr)
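
The autoid branch at the top of this example reduces to a simple alignment rule: an auto-identified expression inherits the gold supersenses only if its token set exactly matches a gold expression's, otherwise its labels are cleared. A minimal sketch of that rule; the helper name align_auto_to_gold is invented for illustration, and the field names follow the dicts above.

    from itertools import chain

    def align_auto_to_gold(auto_wes, gold_swes, gold_smwes):
        for auto_we in auto_wes:
            match = next((we for we in chain(gold_swes.values(), gold_smwes.values())
                          if set(we['toknums']) == set(auto_we['toknums'])), None)
            if match and all(ss is None or '.' in ss for ss in (match['ss'], match['ss2'])):
                auto_we['ss'], auto_we['ss2'] = match['ss'], match['ss2']
            else:
                # no exact token-span match (or gold labels are placeholders): leave unlabeled
                auto_we['ss'], auto_we['ss2'] = None, None
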
Example #3
def main(args):
    if args.colorless or not sys.stdin.isatty():
        for c in dir(Colors):
            if not c.startswith('_'):
                setattr(Colors, c, '')
        for s in dir(Styles):
            if not s.startswith('_'):
                setattr(Styles, s, '')


    goldF = args.goldfile
    sysFs = args.sysfile

    ss_mapper = lambda ss: coarsen_pss(ss, args.depth) if ss.startswith('p.') else ss  # coarsen SNACS ('p.') supersenses to the requested depth; other labels pass through

    # Load gold data
    gold_sents = list(load_sents(goldF, ss_mapper=ss_mapper))

    predFs = [load_sents(predFP, ss_mapper=ss_mapper) for predFP in sysFs]

    all_sys_scores = {}

    def filter_labels(ll):
        result = dict(ll)
        for k,l in ll.items():
            if l.startswith('n.') and args.no_noun: del result[k]
            elif l.startswith('v.') and args.no_verb: del result[k]
            elif l.startswith('p.') and args.no_snacs: del result[k]
        return result

    R = lambda ww,sg,wg,ll: render(ww, sg if not args.no_mwe else [], wg if not args.no_mwe else [], filter_labels(ll))

    for i,sent in enumerate(gold_sents):
        # gold analysis
        words = [t["word"] for t in sent["toks"]]
        rendered = []
        rendered.append(R(words,
                           [e["toknums"] for e in sent["smwes"].values()],
                           [e["toknums"] for e in sent["wmwes"].values()],
                           makelabelmap(sent, include_lexcat=args.lexcats, include_supersenses=True)))
        for predF in predFs:
            psent = next(predF)
            assert psent['sent_id']==sent['sent_id']
            rendered.append(R(words,
                               [e["toknums"] for e in psent["smwes"].values()],
                               [e["toknums"] for e in psent["wmwes"].values()],
                               makelabelmap(sent, include_lexcat=args.lexcats, include_supersenses=True)))

        diff_classes = set()
        if not args.no_diff:
            diff_classes.add('special')
            if not args.no_mwe_diff: diff_classes.add('mwe')
            if not args.no_noun_diff: diff_classes.add('n')
            if not args.no_snacs_diff: diff_classes.add('p')
            if not args.no_verb_diff: diff_classes.add('v')

        if args.sent_ids:
            print(sent['sent_id'], end='\t')
        print(color_rendered(words, rendered, diff_classes))
        #assert False,(color_rendered(words, rendered),words,rendered)

    # restore the terminal's default colors
    print(Colors.ENDC, end='')
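
One piece of this driver is easy to check in isolation: filter_labels drops labels by prefix according to args.no_noun, args.no_verb, and args.no_snacs. A toy run of that filtering logic, with the argparse attributes replaced by plain booleans for clarity (the label values are illustrative STREUSLE-style supersenses):

    def filter_labels(ll, no_noun=False, no_verb=False, no_snacs=False):
        result = dict(ll)
        for k, l in ll.items():
            if l.startswith('n.') and no_noun: del result[k]
            elif l.startswith('v.') and no_verb: del result[k]
            elif l.startswith('p.') and no_snacs: del result[k]
        return result

    labels = {1: 'n.PERSON', 2: 'v.communication', 3: 'p.Recipient'}
    print(filter_labels(labels, no_snacs=True))  # {1: 'n.PERSON', 2: 'v.communication'}
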
Example #4
    def _postproc_sent(sent):
        nonlocal lc_tbd

        # check that tokens are numbered from 1, in order
        for i, tok in enumerate(sent['toks'], 1):
            assert tok['#'] == i

        # check that MWEs are numbered from 1
        # fix_mwe_numbering.py was written to correct this
        for i, (k, mwe) in enumerate(
                sorted(chain(sent['smwes'].items(), sent['wmwes'].items()),
                       key=lambda x: int(x[0])), 1):
            assert int(k) == i, (sent['sent_id'], i, k, mwe)

        # check that lexical & weak MWE lemmas are correct
        for lexe in chain(sent['swes'].values(), sent['smwes'].values()):
            assert lexe['lexlemma'] == ' '.join(
                sent['toks'][i - 1]['lemma'] for i in lexe['toknums']
            ), f"In {sent['sent_id']}, MWE lemma is incorrect: {lexe} vs. {sent['toks'][lexe['toknums'][0]-1]}"
            lc = lexe['lexcat']
            if lc.endswith('!@'): lc_tbd += 1
            valid_ss = supersenses_for_lexcat(lc)
            if lc == 'V':
                assert len(lexe['toknums']) == 1, \
                    f'Verbal MWE lexcat must be subtyped (V.VID, etc., not V): {lexe}'
            ss, ss2 = lexe['ss'], lexe['ss2']
            if valid_ss:
                if ss == '??':
                    assert ss2 is None
                elif ss not in valid_ss or (lc in (
                        'N', 'V') or lc.startswith('V.')) != (ss2 is None) or (
                            ss2 is not None and ss2 not in valid_ss):
                    assert False, f"In {sent['sent_id']}, invalid supersense(s) in lexical entry: {lexe}"
                elif ss.startswith('p.'):
                    assert ss2.startswith('p.')
                    assert ss2 not in {
                        'p.Experiencer', 'p.Stimulus', 'p.Originator',
                        'p.Recipient', 'p.SocialRel', 'p.OrgRole'
                    }, (f'{ss2} should never be function', lexe)
                    if ss != ss2:
                        ssA, ss2A = ancestors(ss), ancestors(ss2)
                        # there are just a few permissible combinations where one is the ancestor of the other
                        if (ss, ss2) not in {('p.Whole', 'p.Gestalt'),
                                             ('p.Goal', 'p.Locus'),
                                             ('p.Circumstance', 'p.Locus'),
                                             ('p.Circumstance', 'p.Path'),
                                             ('p.Locus', 'p.Goal'),
                                             ('p.Locus', 'p.Source'),
                                             ('p.Characteristic', 'p.Stuff')}:
                            assert ss not in ss2A, f"In {sent['sent_id']}, unexpected construal: {ss} ~> {ss2}"
                            assert ss2 not in ssA, f"In {sent['sent_id']}, unexpected construal: {ss} ~> {ss2}"
            else:
                assert ss is None and ss2 is None and lc not in (
                    'N', 'V', 'P', 'INF.P', 'PP', 'POSS', 'PRON.POSS'), lexe

        # check lexcat on single-word expressions
        for swe in sent['swes'].values():
            tok = sent['toks'][swe['toknums'][0] - 1]
            upos, xpos = tok['upos'], tok['xpos']
            lc = swe['lexcat']
            if lc.endswith('!@'): continue
            assert lc in ALL_LEXCATS, f"In {sent['sent_id']}, invalid lexcat for single-word expression: {lc} in {tok}"
            if (xpos == 'TO') != lc.startswith('INF'):
                assert upos == 'SCONJ' and swe['lexlemma'] == 'for', (
                    sent['sent_id'], swe, tok)
            if (upos in ('NOUN', 'PROPN')) != (lc == 'N'):
                try:
                    assert upos in ('SYM', 'X') or (lc in ('PRON', 'DISC')), (
                        sent['sent_id'], swe, tok)
                except AssertionError:
                    print('Suspicious lexcat/POS combination:',
                          sent['sent_id'],
                          swe,
                          tok,
                          file=sys.stderr)
            if (upos == 'AUX') != (lc == 'AUX'):
                assert tok['lemma'] == 'be' and lc == 'V', (
                    sent['sent_id'], tok)  # copula has upos=AUX
            if (upos == 'VERB') != (lc == 'V'):
                if lc == 'ADJ':
                    print('Word treated as VERB in UD, ADJ for supersenses:',
                          sent['sent_id'],
                          tok['word'],
                          file=sys.stderr)
                else:
                    assert tok['lemma'] == 'be' and lc == 'V', (
                        sent['sent_id'], tok)  # copula has upos=AUX
            if upos == 'PRON':
                assert lc == 'PRON' or lc == 'PRON.POSS', (sent['sent_id'],
                                                           tok)
            if lc == 'ADV':
                assert upos == 'ADV' or upos == 'PART', (
                    sent['sent_id'], tok)  # PART is for negations
            assert lc != 'PP', ('PP should only apply to strong MWEs',
                                sent['sent_id'], tok)
        for smwe in sent['smwes'].values():
            assert len(smwe['toknums']) > 1
        for wmwe in sent['wmwes'].values():
            assert len(wmwe['toknums']) > 1, \
                f"In {sent['sent_id']}, weak MWE has only one token according to group indices: {wmwe}"
            assert wmwe['lexlemma'] == ' '.join(
                sent['toks'][i - 1]['lemma'] for i in wmwe['toknums']
            ), (wmwe, sent['toks'][wmwe['toknums'][0] - 1])
        # we already checked that noninitial tokens in an MWE have _ as their lemma

        # check lextags
        smweGroups = [smwe['toknums'] for smwe in sent['smwes'].values()]
        wmweGroups = [wmwe['toknums'] for wmwe in sent['wmwes'].values()]
        tagging = sent_tags(len(sent['toks']), sent['mwe'], smweGroups,
                            wmweGroups)
        for tok, tag in zip(sent['toks'], tagging):
            fulllextag = tag
            if tok['smwe']:
                smweNum, position = tok['smwe']
                lexe = sent['smwes'][smweNum]
            else:
                position = None
                lexe = sent['swes'][tok['#']]

            if position is None or position == 1:
                lexcat = lexe['lexcat']
                fulllextag += '-' + lexcat
                ss1, ss2 = lexe['ss'], lexe['ss2']
                if ss1 is not None:
                    assert ss1
                    fulllextag += '-' + ss1
                    if ss2 is not None and ss2 != ss1:
                        assert ss2
                        fulllextag += '|' + ss2
                if tok['wmwe']:
                    wmweNum, position = tok['wmwe']
                    wmwe = sent['wmwes'][wmweNum]
                    wcat = wmwe['lexcat']
                    if wcat and position == 1:
                        fulllextag += '+' + wcat

            assert tok['lextag'] == fulllextag, \
                f"In {sent['sent_id']}, the full tag at the end of the line is inconsistent with the rest of the line ({fulllextag} expected): {tok}"

        # check rendered MWE string
        s = render([tok['word'] for tok in sent['toks']], smweGroups,
                   wmweGroups)
        if sent['mwe'] != s:
            caveat = ' (may be due to simplification)' if '$1' in sent['mwe'] else ''
            print(f'MWE string mismatch{caveat}:',
                  s,
                  sent['mwe'],
                  sent['sent_id'],
                  file=sys.stderr)
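
The construal check in this example (the ancestors() test) enforces that when the scene role ss and the function ss2 differ, neither may be an ancestor of the other in the SNACS hierarchy, apart from a short whitelist of attested pairs. A hedged restatement as a predicate: ancestors is the helper already used above (its import is not shown in this excerpt), and the name check_construal is invented here.

    PERMISSIBLE_ANCESTOR_CONSTRUALS = {
        ('p.Whole', 'p.Gestalt'), ('p.Goal', 'p.Locus'),
        ('p.Circumstance', 'p.Locus'), ('p.Circumstance', 'p.Path'),
        ('p.Locus', 'p.Goal'), ('p.Locus', 'p.Source'),
        ('p.Characteristic', 'p.Stuff'),
    }

    def check_construal(ss, ss2, ancestors):
        """True iff the role ~> function pair passes the hierarchy check above."""
        if ss == ss2 or (ss, ss2) in PERMISSIBLE_ANCESTOR_CONSTRUALS:
            return True
        return ss not in ancestors(ss2) and ss2 not in ancestors(ss)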