def _postproc_sent(sent):
    nonlocal lc_tbd

    assert 'sent_id' in sent, sent

    # check that tokens are numbered from 1, in order
    for i, tok in enumerate(sent['toks'], 1):
        assert tok['#'] == i

    # check that MWEs are numbered from 1 based on first token offset
    xmwes = [(e["toknums"][0], 's', mwenum) for mwenum, e in sent['smwes'].items()]
    xmwes += [(e["toknums"][0], 'w', mwenum) for mwenum, e in sent['wmwes'].items()]
    xmwes.sort()
    for k, mwe in chain(sent['smwes'].items(), sent['wmwes'].items()):
        assert xmwes[int(k) - 1][2] == k, f"In {sent['sent_id']}, MWEs are not numbered in the correct order: use normalize_mwe_numbering.py to fix"

    # check that lexical & weak MWE lemmas are correct
    lexes_to_validate = chain(sent['swes'].values(), sent['smwes'].values()) if validate_type else []
    for lexe in lexes_to_validate:
        sent['toks'][lexe['toknums'][0] - 1]
        assert lexe['lexlemma'] == ' '.join(sent['toks'][i - 1]['lemma'] for i in lexe['toknums']), f"In {sent['sent_id']}, MWE lemma is incorrect: {lexe} vs. {sent['toks'][lexe['toknums'][0]-1]}"
        lc = lexe['lexcat']
        if lc.endswith('!@'):
            lc_tbd += 1
        valid_ss = supersenses_for_lexcat(lc)
        if lc == 'V':
            assert len(lexe['toknums']) == 1, f'In {sent["sent_id"]}, Verbal MWE "{lexe["lexlemma"]}" lexcat must be subtyped (V.VID, etc., not V)'
        ss, ss2 = lexe['ss'], lexe['ss2']
        if valid_ss:
            if ss == '??':
                assert ss2 is None
            elif (ss not in valid_ss
                  or (lc in ('N', 'V') or lc.startswith('V.')) != (ss2 is None)
                  or (ss2 is not None and ss2 not in valid_ss)):
                assert False, f"In {sent['sent_id']}, invalid supersense(s) in lexical entry: {lexe}"
            elif ss.startswith('p.'):
                assert ss2.startswith('p.')
                assert ss2 not in {'p.Experiencer', 'p.Stimulus', 'p.Originator',
                                   'p.Recipient', 'p.SocialRel', 'p.OrgRole'}, (f'{ss2} should never be function', lexe)
                if ss != ss2:
                    ssA, ss2A = ancestors(ss), ancestors(ss2)
                    # there are just a few permissible combinations where one is the ancestor of the other
                    if (ss, ss2) not in {('p.Whole', 'p.Gestalt'), ('p.Goal', 'p.Locus'),
                                         ('p.Circumstance', 'p.Locus'), ('p.Circumstance', 'p.Path'),
                                         ('p.Locus', 'p.Goal'), ('p.Locus', 'p.Source'),
                                         ('p.Characteristic', 'p.Stuff')}:
                        assert ss not in ss2A, f"In {sent['sent_id']}, unexpected construal: {ss} ~> {ss2}"
                        assert ss2 not in ssA, f"In {sent['sent_id']}, unexpected construal: {ss} ~> {ss2}"
        else:
            assert ss is None and ss2 is None and lexe not in ('N', 'V', 'P', 'INF.P', 'PP', 'POSS', 'PRON.POSS'), lexe

    # check lexcat on single-word expressions
    for swe in sent['swes'].values():
        tok = sent['toks'][swe['toknums'][0] - 1]
        upos, xpos = tok['upos'], tok['xpos']
        lc = swe['lexcat']
        if lc.endswith('!@'):
            continue
        if lc not in ALL_LEXCATS:
            assert not validate_type, f"In {sent['sent_id']}, invalid lexcat {lc} for single-word expression '{tok['word']}'"
            continue
        if (validate_pos and upos != lc and lc != 'PP'
                and (upos, lc) not in {('NOUN', 'N'), ('PROPN', 'N'), ('VERB', 'V'),
                                       ('ADP', 'P'), ('ADV', 'P'), ('SCONJ', 'P'),
                                       ('ADP', 'DISC'), ('ADV', 'DISC'), ('SCONJ', 'DISC'),
                                       ('PART', 'POSS')}):
            # most often, the single-word lexcat should match its upos
            # check a list of exceptions
            mismatchOK = False
            if xpos == 'TO' and lc.startswith('INF'):
                mismatchOK = True
            elif (xpos == 'TO') != lc.startswith('INF'):
                assert upos == 'SCONJ' and swe['lexlemma'] == 'for', (sent['sent_id'], swe, tok)
                mismatchOK = True

            if (upos in ('NOUN', 'PROPN')) != (lc == 'N'):
                try:
                    assert upos in ('SYM', 'X') or (lc in ('PRON', 'DISC')), (sent['sent_id'], swe, tok)
                except AssertionError:
                    print('Suspicious lexcat/POS combination:', sent['sent_id'], swe, tok, file=sys.stderr)
                mismatchOK = True

            message = f"In {sent['sent_id']}, single-word expression '{tok['word']}' has lexcat {lc}, which is incompatible with its upos {upos}"
            if (upos == 'AUX') != (lc == 'AUX'):
                assert tok['lemma'] == 'be' and lc == 'V', message  # copula has upos=AUX
                mismatchOK = True
            if (upos == 'VERB') != (lc == 'V'):
                if lc == 'ADJ':
                    print('Word treated as VERB in UD, ADJ for supersenses:', sent['sent_id'], tok['word'], file=sys.stderr)
                else:
                    assert tok['lemma'] == 'be' and lc == 'V', message  # copula has upos=AUX
                mismatchOK = True
            if upos == 'PRON':
                assert lc == 'PRON' or lc == 'PRON.POSS', message
                mismatchOK = True
            if lc == 'ADV':
                assert upos == 'ADV' or upos == 'PART', message  # PART is for negations
                mismatchOK = True
            if upos == 'ADP' and lc == 'CCONJ':
                assert tok['lemma'] == 'versus'
                mismatchOK = True

            assert mismatchOK, message

        if validate_type:
            assert lc != 'PP', f"In {sent['sent_id']}, PP should only apply to strong MWEs, but occurs for single-word expression '{tok['word']}'"

    for smwe in sent['smwes'].values():
        assert len(smwe['toknums']) > 1
    for wmwe in sent['wmwes'].values():
        assert len(wmwe['toknums']) > 1, f"In {sent['sent_id']}, weak MWE has only one token according to group indices: {wmwe}"
        assert wmwe['lexlemma'] == ' '.join(sent['toks'][i - 1]['lemma'] for i in wmwe['toknums']), (wmwe, sent['toks'][wmwe['toknums'][0] - 1])
    # we already checked that noninitial tokens in an MWE have _ as their lemma

    # check lextags
    smweGroups = [smwe['toknums'] for smwe in sent['smwes'].values()]
    wmweGroups = [wmwe['toknums'] for wmwe in sent['wmwes'].values()]
    if 'mwe' not in sent:
        sent['mwe'] = render_sent(sent, False, False)
    tagging = sent_tags(len(sent['toks']), sent['mwe'], smweGroups, wmweGroups)
    for tok, tag in zip(sent['toks'], tagging):
        fulllextag = tag
        if tok['smwe']:
            smweNum, position = tok['smwe']
            lexe = sent['smwes'][smweNum]
        else:
            position = None
            lexe = sent['swes'][tok['#']]

        if position is None or position == 1:
            lexcat = lexe['lexcat']
            fulllextag += '-' + lexcat
            ss1, ss2 = lexe['ss'], lexe['ss2']
            if ss1 is not None:
                assert ss1
                fulllextag += '-' + ss1
                if ss2 is not None and ss2 != ss1:
                    assert ss2
                    fulllextag += '|' + ss2
            if tok['wmwe']:
                wmweNum, position = tok['wmwe']
                wmwe = sent['wmwes'][wmweNum]
                wcat = wmwe['lexcat']
                if wcat and position == 1:
                    fulllextag += '+' + wcat

        assert tok['lextag'] == fulllextag, f"In {sent['sent_id']}, the full tag at the end of the line is inconsistent with the rest of the line ({fulllextag} expected): {tok}"

    # check rendered MWE string
    s = render([tok['word'] for tok in sent['toks']], smweGroups, wmweGroups)
    if sent['mwe'] != s:
        caveat = ' (may be due to simplification)' if '$1' in sent['mwe'] else ''
        print(f'MWE string mismatch{caveat}:', s, sent['mwe'], sent['sent_id'], file=sys.stderr)
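# --- Illustrative sketch (not in the original source): the MWE-numbering check
# from _postproc_sent above, restated on a toy sentence dict. The field names
# ('smwes', 'wmwes', 'toknums') follow the structures used above; the helper
# name _check_mwe_order and the toy data are invented for demonstration only.
def _check_mwe_order(sent):
    """Strong and weak MWE keys must be numbered 1..n in order of their first token offset."""
    xmwes = [(e['toknums'][0], 's', k) for k, e in sent['smwes'].items()]
    xmwes += [(e['toknums'][0], 'w', k) for k, e in sent['wmwes'].items()]
    xmwes.sort()
    for k in list(sent['smwes']) + list(sent['wmwes']):
        assert xmwes[int(k) - 1][2] == k, f"MWE {k} is numbered out of order"

_check_mwe_order({'smwes': {1: {'toknums': [1, 2]}}, 'wmwes': {}})  # passes: the only MWE is numbered 1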
def _postproc_sent(sent):
    nonlocal lc_tbd
    sent['autoid_swes'] = sent.get('autoid_swes') or {}
    sent['autoid_smwes'] = sent.get('autoid_smwes') or {}

    # autoid/goldid - pick one according to args.
    # For autoid, fill in gold ss,ss2 if there's an exact match in gold id
    if identification == 'autoid':
        for auto_we in chain(sent['autoid_swes'].values(), sent['autoid_smwes'].values()):
            matching_gold_wes = [we for we in chain(sent['swes'].values(), sent['smwes'].values())
                                 if set(we['toknums']) == set(auto_we['toknums'])]
            gold_we = (matching_gold_wes + [None])[0]
            if gold_we and all([ss is None or '.' in ss for ss in [gold_we['ss'], gold_we['ss2']]]):
                auto_we['ss'], auto_we['ss2'] = gold_we['ss'], gold_we['ss2']
            else:
                auto_we['ss'], auto_we['ss2'] = None, None
        sent['swes'], sent['smwes'] = sent['autoid_swes'], sent['autoid_smwes']
        for tok in sent['toks']:
            tok['smwe'] = tok.get('autoid_smwe')
            if 'autoid_smwe' in tok:
                del tok['autoid_smwe']
            tok['wmwe'] = None
        sent['wmwes'] = {}
        del sent['autoid_smwes']
        del sent['autoid_swes']

    # check that tokens are numbered from 1, in order
    for i, tok in enumerate(sent['toks'], 1):
        assert tok['#'] == i

    # check that MWEs are numbered from 1
    # fix_mwe_numbering.py was written to correct this
    for i, (k, mwe) in enumerate(sorted(chain(sent['smwes'].items(), sent['wmwes'].items()),
                                        key=lambda x: int(x[0])), 1):
        assert int(k) == i, (sent['sent_id'], i, k, mwe)

    # check that lexical & weak MWE lemmas are correct
    for lexe in chain(sent['swes'].values(), sent['smwes'].values()):
        lexe['lexlemma'] = ' '.join(sent['toks'][i - 1]['lemma'] for i in lexe['toknums'])
        lc = lexe['lexcat']
        if lc.endswith('!@'):
            lc_tbd += 1
        valid_ss = supersenses_for_lexcat(lc)
        ss, ss2 = lexe['ss'], lexe['ss2']
        if valid_ss:
            if ss == '??':
                assert ss2 is None
            elif (ss not in valid_ss
                  or (lc in ('N', 'V')) != (ss2 is None)
                  or (ss2 is not None and ss2 not in valid_ss)):
                print('Invalid supersense(s) in lexical entry:', lexe, file=sys.stderr)
            elif ss.startswith('p.'):
                assert ss2.startswith('p.')
                assert ss2 not in {'p.Experiencer', 'p.Stimulus', 'p.Originator',
                                   'p.Recipient', 'p.SocialRel', 'p.OrgRole'}, (ss2 + ' should never be function', lexe)
        else:
            assert ss is None and ss2 is None and lexe not in ('N', 'V', 'P', 'INF.P', 'PP', 'POSS', 'PRON.POSS'), lexe

    # check lexcat on single-word expressions
    for swe in sent['swes'].values():
        tok = sent['toks'][swe['toknums'][0] - 1]
        upos, xpos = tok['upos'], tok['xpos']
        lc = swe['lexcat']
        if lc.endswith('!@'):
            continue
        assert lc in ALL_LEXCATS, (sent['sent_id'], tok)
        if (xpos == 'TO') != lc.startswith('INF'):
            # assert upos=='SCONJ' and swe['lexlemma']=='for',(sent['sent_id'],swe,tok)
            pass
        if (upos in ('NOUN', 'PROPN')) != (lc == 'N'):
            try:
                assert upos in ('SYM', 'X') or (lc in ('PRON', 'DISC')), (sent['sent_id'], swe, tok)
            except AssertionError:
                print('Suspicious lexcat/POS combination:', sent['sent_id'], swe, tok, file=sys.stderr)
        if (upos == 'AUX') != (lc == 'AUX'):
            # assert tok['lemma']=='be' and lc=='V',(sent['sent_id'],tok)  # copula has upos=AUX
            pass
        if (upos == 'VERB') != (lc == 'V'):
            if lc == 'ADJ':
                print('Word treated as VERB in UD, ADJ for supersenses:', sent['sent_id'], tok['word'], file=sys.stderr)
            else:
                # assert tok['lemma']=='be' and lc=='V',(sent['sent_id'],tok)  # copula has upos=AUX
                pass
        if upos == 'PRON':
            # assert lc=='PRON' or lc=='PRON.POSS',(sent['sent_id'],tok)
            pass
        if lc == 'ADV':
            # assert upos=='ADV' or upos=='PART',(sent['sent_id'],tok)  # PART is for negations
            pass
        assert lc != 'PP', ('PP should only apply to strong MWEs', sent['sent_id'], tok)

    for smwe in sent['smwes'].values():
        assert len(smwe['toknums']) > 1
    for wmwe in sent['wmwes'].values():
        assert len(wmwe['toknums']) > 1, (sent['sent_id'], wmwe)
        # assert wmwe['lexlemma']==' '.join(sent['toks'][i-1]['lemma'] for i in wmwe['toknums']),(wmwe,sent['toks'][wmwe['toknums'][0]-1])
    # we already checked that noninitial tokens in an MWE have _ as their lemma

    # check lextags
    smweGroups = [smwe['toknums'] for smwe in sent['smwes'].values()]
    wmweGroups = [wmwe['toknums'] for wmwe in sent['wmwes'].values()]
    tagging = sent_tags(len(sent['toks']), sent['mwe'], smweGroups, wmweGroups)
    for tok, tag in zip(sent['toks'], tagging):
        fulllextag = tag
        if tok['smwe']:
            smweNum, position = tok['smwe']
            lexe = sent['smwes'][smweNum]
        elif tok['#'] in sent['swes']:
            position = None
            lexe = sent['swes'][tok['#']]
        else:
            lexe = None

        if lexe and (position is None or position == 1):
            lexcat = lexe['lexcat']
            fulllextag += '-' + lexcat
            ss1, ss2 = lexe['ss'], lexe['ss2']
            if ss1 is not None:
                assert ss1
                fulllextag += '-' + ss1
                if ss2 is not None and ss2 != ss1:
                    assert ss2
                    fulllextag += '|' + ss2
            if tok['wmwe']:
                wmweNum, position = tok['wmwe']
                wmwe = sent['wmwes'][wmweNum]
                wcat = wmwe['lexcat']
                if wcat and position == 1:
                    fulllextag += '+' + wcat
        # assert tok['lextag']==fulllextag,(sent['sent_id'],fulllextag,tok)

    # check rendered MWE string
    s = render([tok['word'] for tok in sent['toks']], smweGroups, wmweGroups)
    if sent['mwe'] != s:
        caveat = ' (may be due to simplification)' if '$1' in sent['mwe'] else ''
        print('MWE string mismatch' + caveat + ':', s, sent['mwe'], sent['sent_id'], file=sys.stderr)
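# --- Illustrative sketch (not in the original source): the exact-span matching
# rule from the autoid branch above, in isolation. An automatically identified
# expression inherits the gold supersenses only if some gold expression covers
# exactly the same token set and the gold labels are either absent or fully
# specified (contain a '.'). The helper name and toy data are invented.
from itertools import chain

def _gold_supersenses_for(auto_we, gold_swes, gold_smwes):
    matches = [we for we in chain(gold_swes.values(), gold_smwes.values())
               if set(we['toknums']) == set(auto_we['toknums'])]
    gold_we = (matches + [None])[0]
    if gold_we and all(ss is None or '.' in ss for ss in (gold_we['ss'], gold_we['ss2'])):
        return gold_we['ss'], gold_we['ss2']
    return None, None

print(_gold_supersenses_for({'toknums': [3, 4]},
                            {},  # gold single-word expressions
                            {1: {'toknums': [3, 4], 'ss': 'p.Goal', 'ss2': 'p.Locus'}}))
# -> ('p.Goal', 'p.Locus')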
def main(args):
    if args.colorless or not sys.stdin.isatty():
        for c in dir(Colors):
            if not c.startswith('_'):
                setattr(Colors, c, '')
        for s in dir(Styles):
            if not s.startswith('_'):
                setattr(Styles, s, '')

    goldF = args.goldfile
    sysFs = args.sysfile

    ss_mapper = lambda ss: coarsen_pss(ss, args.depth) if ss.startswith('p.') else ss

    # Load gold data
    gold_sents = list(load_sents(goldF, ss_mapper=ss_mapper))
    predFs = [load_sents(predFP, ss_mapper=ss_mapper) for predFP in sysFs]
    all_sys_scores = {}

    def filter_labels(ll):
        result = dict(ll)
        for k, l in ll.items():
            if l.startswith('n.') and args.no_noun:
                del result[k]
            elif l.startswith('v.') and args.no_verb:
                del result[k]
            elif l.startswith('p.') and args.no_snacs:
                del result[k]
        return result

    R = lambda ww, sg, wg, ll: render(ww,
                                      sg if not args.no_mwe else [],
                                      wg if not args.no_mwe else [],
                                      filter_labels(ll))

    for i, sent in enumerate(gold_sents):
        # gold analysis
        words = [t["word"] for t in sent["toks"]]
        rendered = []
        rendered.append(R(words,
                          [e["toknums"] for e in sent["smwes"].values()],
                          [e["toknums"] for e in sent["wmwes"].values()],
                          makelabelmap(sent, include_lexcat=args.lexcats, include_supersenses=True)))
        for predF in predFs:
            psent = next(predF)
            assert psent['sent_id'] == sent['sent_id']
            rendered.append(R(words,
                              [e["toknums"] for e in psent["smwes"].values()],
                              [e["toknums"] for e in psent["wmwes"].values()],
                              makelabelmap(sent, include_lexcat=args.lexcats, include_supersenses=True)))

        diff_classes = set()
        if not args.no_diff:
            diff_classes.add('special')
        if not args.no_mwe_diff:
            diff_classes.add('mwe')
        if not args.no_noun_diff:
            diff_classes.add('n')
        if not args.no_snacs_diff:
            diff_classes.add('p')
        if not args.no_verb_diff:
            diff_classes.add('v')

        if args.sent_ids:
            print(sent['sent_id'], end='\t')
        print(color_rendered(words, rendered, diff_classes))
        #assert False,(color_rendered(words, rendered),words,rendered)

    # restore the terminal's default colors
    print(Colors.ENDC, end='')
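# --- Illustrative sketch (not in the original source): a minimal argparse
# front-end for main() above. The option names below are assumptions inferred
# from the attributes main() reads (args.goldfile, args.sysfile, args.depth,
# args.colorless, args.lexcats, args.sent_ids, and the various no_* switches);
# the real script's CLI may define them differently.
import argparse

def build_argparser():
    p = argparse.ArgumentParser(description='Render gold vs. system lexical-semantic analyses with color diffs')
    p.add_argument('goldfile', help='gold analysis file')
    p.add_argument('sysfile', nargs='*', help='zero or more system prediction files')
    p.add_argument('--depth', type=int, default=4, help='depth at which to coarsen p.* supersenses')
    p.add_argument('--colorless', action='store_true')
    p.add_argument('--lexcats', action='store_true')
    p.add_argument('--sent-ids', dest='sent_ids', action='store_true')
    for flag in ('no-mwe', 'no-noun', 'no-verb', 'no-snacs',
                 'no-diff', 'no-mwe-diff', 'no-noun-diff', 'no-verb-diff', 'no-snacs-diff'):
        p.add_argument('--' + flag, dest=flag.replace('-', '_'), action='store_true')
    return p

# Usage (hypothetical): main(build_argparser().parse_args())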
def _postproc_sent(sent):
    nonlocal lc_tbd

    # check that tokens are numbered from 1, in order
    for i, tok in enumerate(sent['toks'], 1):
        assert tok['#'] == i

    # check that MWEs are numbered from 1
    # fix_mwe_numbering.py was written to correct this
    for i, (k, mwe) in enumerate(sorted(chain(sent['smwes'].items(), sent['wmwes'].items()),
                                        key=lambda x: int(x[0])), 1):
        assert int(k) == i, (sent['sent_id'], i, k, mwe)

    # check that lexical & weak MWE lemmas are correct
    for lexe in chain(sent['swes'].values(), sent['smwes'].values()):
        assert lexe['lexlemma'] == ' '.join(sent['toks'][i - 1]['lemma'] for i in lexe['toknums']), f"In {sent['sent_id']}, MWE lemma is incorrect: {lexe} vs. {sent['toks'][lexe['toknums'][0]-1]}"
        lc = lexe['lexcat']
        if lc.endswith('!@'):
            lc_tbd += 1
        valid_ss = supersenses_for_lexcat(lc)
        if lc == 'V':
            assert len(lexe['toknums']) == 1, f'Verbal MWE lexcat must be subtyped (V.VID, etc., not V): {lexe}'
        ss, ss2 = lexe['ss'], lexe['ss2']
        if valid_ss:
            if ss == '??':
                assert ss2 is None
            elif (ss not in valid_ss
                  or (lc in ('N', 'V') or lc.startswith('V.')) != (ss2 is None)
                  or (ss2 is not None and ss2 not in valid_ss)):
                assert False, f"In {sent['sent_id']}, invalid supersense(s) in lexical entry: {lexe}"
            elif ss.startswith('p.'):
                assert ss2.startswith('p.')
                assert ss2 not in {'p.Experiencer', 'p.Stimulus', 'p.Originator',
                                   'p.Recipient', 'p.SocialRel', 'p.OrgRole'}, (f'{ss2} should never be function', lexe)
                if ss != ss2:
                    ssA, ss2A = ancestors(ss), ancestors(ss2)
                    # there are just a few permissible combinations where one is the ancestor of the other
                    if (ss, ss2) not in {('p.Whole', 'p.Gestalt'), ('p.Goal', 'p.Locus'),
                                         ('p.Circumstance', 'p.Locus'), ('p.Circumstance', 'p.Path'),
                                         ('p.Locus', 'p.Goal'), ('p.Locus', 'p.Source'),
                                         ('p.Characteristic', 'p.Stuff')}:
                        assert ss not in ss2A, f"In {sent['sent_id']}, unexpected construal: {ss} ~> {ss2}"
                        assert ss2 not in ssA, f"In {sent['sent_id']}, unexpected construal: {ss} ~> {ss2}"
        else:
            assert ss is None and ss2 is None and lexe not in ('N', 'V', 'P', 'INF.P', 'PP', 'POSS', 'PRON.POSS'), lexe

    # check lexcat on single-word expressions
    for swe in sent['swes'].values():
        tok = sent['toks'][swe['toknums'][0] - 1]
        upos, xpos = tok['upos'], tok['xpos']
        lc = swe['lexcat']
        if lc.endswith('!@'):
            continue
        assert lc in ALL_LEXCATS, f"In {sent['sent_id']}, invalid lexcat for single-word expression: {lc} in {tok}"
        if (xpos == 'TO') != lc.startswith('INF'):
            assert upos == 'SCONJ' and swe['lexlemma'] == 'for', (sent['sent_id'], swe, tok)
        if (upos in ('NOUN', 'PROPN')) != (lc == 'N'):
            try:
                assert upos in ('SYM', 'X') or (lc in ('PRON', 'DISC')), (sent['sent_id'], swe, tok)
            except AssertionError:
                print('Suspicious lexcat/POS combination:', sent['sent_id'], swe, tok, file=sys.stderr)
        if (upos == 'AUX') != (lc == 'AUX'):
            assert tok['lemma'] == 'be' and lc == 'V', (sent['sent_id'], tok)  # copula has upos=AUX
        if (upos == 'VERB') != (lc == 'V'):
            if lc == 'ADJ':
                print('Word treated as VERB in UD, ADJ for supersenses:', sent['sent_id'], tok['word'], file=sys.stderr)
            else:
                assert tok['lemma'] == 'be' and lc == 'V', (sent['sent_id'], tok)  # copula has upos=AUX
        if upos == 'PRON':
            assert lc == 'PRON' or lc == 'PRON.POSS', (sent['sent_id'], tok)
        if lc == 'ADV':
            assert upos == 'ADV' or upos == 'PART', (sent['sent_id'], tok)  # PART is for negations
        assert lc != 'PP', ('PP should only apply to strong MWEs', sent['sent_id'], tok)

    for smwe in sent['smwes'].values():
        assert len(smwe['toknums']) > 1
    for wmwe in sent['wmwes'].values():
        assert len(wmwe['toknums']) > 1, f"In {sent['sent_id']}, weak MWE has only one token according to group indices: {wmwe}"
        assert wmwe['lexlemma'] == ' '.join(sent['toks'][i - 1]['lemma'] for i in wmwe['toknums']), (wmwe, sent['toks'][wmwe['toknums'][0] - 1])
    # we already checked that noninitial tokens in an MWE have _ as their lemma

    # check lextags
    smweGroups = [smwe['toknums'] for smwe in sent['smwes'].values()]
    wmweGroups = [wmwe['toknums'] for wmwe in sent['wmwes'].values()]
    tagging = sent_tags(len(sent['toks']), sent['mwe'], smweGroups, wmweGroups)
    for tok, tag in zip(sent['toks'], tagging):
        fulllextag = tag
        if tok['smwe']:
            smweNum, position = tok['smwe']
            lexe = sent['smwes'][smweNum]
        else:
            position = None
            lexe = sent['swes'][tok['#']]

        if position is None or position == 1:
            lexcat = lexe['lexcat']
            fulllextag += '-' + lexcat
            ss1, ss2 = lexe['ss'], lexe['ss2']
            if ss1 is not None:
                assert ss1
                fulllextag += '-' + ss1
                if ss2 is not None and ss2 != ss1:
                    assert ss2
                    fulllextag += '|' + ss2
            if tok['wmwe']:
                wmweNum, position = tok['wmwe']
                wmwe = sent['wmwes'][wmweNum]
                wcat = wmwe['lexcat']
                if wcat and position == 1:
                    fulllextag += '+' + wcat

        assert tok['lextag'] == fulllextag, f"In {sent['sent_id']}, the full tag at the end of the line is inconsistent with the rest of the line ({fulllextag} expected): {tok}"

    # check rendered MWE string
    s = render([tok['word'] for tok in sent['toks']], smweGroups, wmweGroups)
    if sent['mwe'] != s:
        caveat = ' (may be due to simplification)' if '$1' in sent['mwe'] else ''
        print(f'MWE string mismatch{caveat}:', s, sent['mwe'], sent['sent_id'], file=sys.stderr)