def main(args):
    with open(args.fname, encoding="utf-8") as f, \
            open(args.lextags, encoding="utf-8") as lextags_lines:
        print_json(
            swap_lextags(load_sents(f, ss_mapper=SSMapper(args.depth)),
                         map(ast.literal_eval, lextags_lines)))
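# A minimal worked example (not from the source) of the per-line input format
# implied by map(ast.literal_eval, lextags_lines) above: each line of the
# lextags file is a Python literal list of per-token lextags. The tag values
# here are hypothetical.
import ast
assert ast.literal_eval("['O', 'B-P-p.Topic', 'I_']") == ['O', 'B-P-p.Topic', 'I_']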
def main(args):
    goldF = args.goldfile
    sysFs = args.sysfile
    ss_mapper = lambda ss: coarsen_pss(ss, args.depth) if ss.startswith('p.') else ss

    # Load gold data
    gold_sents = list(load_sents(goldF, ss_mapper=ss_mapper))
    for sent in gold_sents:
        sent['punits'] = {
            tuple(e['toknums']): (e['lexcat'], e['ss'], e['ss2'])
            for e in list(sent['swes'].values()) + list(sent['smwes'].values())
            if e['ss'] and (e['ss'].startswith('p.') or e['ss'] == '??')
        }

    all_sys_scores = {}
    for sysF in sysFs:
        sysscores = eval_sys(sysF, gold_sents, ss_mapper)
        syspath = sysF.name
        basename = syspath.rsplit('.', 2)[0]
        if basename not in all_sys_scores:
            all_sys_scores[basename] = [
                defaultdict(lambda: defaultdict(Counter)),
                defaultdict(lambda: defaultdict(Counter))
            ]
        if syspath.split('.')[-2] == 'goldid':
            all_sys_scores[basename][0] = sysscores
        else:
            all_sys_scores[basename][1] = sysscores

    # Print output
    args.output_format(all_sys_scores, depth=args.depth)
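# A worked example (hypothetical file names) of the path bucketing above:
# 'mysys.goldid.json' and 'mysys.autoid.json' share the basename 'mysys',
# so their score sets land in slots 0 and 1 of the same all_sys_scores entry.
assert 'mysys.goldid.json'.rsplit('.', 2)[0] == 'mysys'
assert 'mysys.autoid.json'.rsplit('.', 2)[0] == 'mysys'
assert 'mysys.goldid.json'.split('.')[-2] == 'goldid'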
def __call__(self, batch_tags: List[List[str]],
             batch_gold_tags: List[List[str]],
             batch_upos: List[List[str]]):
    tempdir = tempfile.mkdtemp()
    gold_path = os.path.join(tempdir, "gold.json")
    predicted_path = os.path.join(tempdir, "predicted.autoid.json")
    # TODO(danielhers): Unused variable:
    # unpacked_predicted_path = os.path.join(tempdir, "unpacked_predicted.autoid.json")
    with open(predicted_path, "w", encoding="utf-8") as predicted_file, \
            open(gold_path, "w", encoding="utf-8") as gold_file:
        for tags, gold_tags, upos in zip(batch_tags, batch_gold_tags, batch_upos):
            write_conllulex_formatted_tags_to_file(predicted_file, gold_file,
                                                   tags, gold_tags, upos)
    with open(predicted_path, encoding="utf-8") as predicted_file:
        # TODO(danielhers): Unused variable:
        # \ open(unpacked_predicted_path, "w", encoding="utf-8") as unpacked_predicted_file:
        print_json(unpack_sents(predicted_file))
    # reopen the predictions here: the handle from the previous block is
    # closed once its `with` exits, so reusing it in eval_sys would fail
    with open(gold_path, encoding="utf-8") as gold_file, \
            open(predicted_path, encoding="utf-8") as predicted_file:
        # TODO(danielhers): Unused variable:
        # \ open(unpacked_predicted_path, encoding="utf-8") as unpacked_predicted_file:
        gold_sents = list(load_sents(gold_file, ss_mapper=ss_mapper))
        self._scores = eval_sys(predicted_file, gold_sents, ss_mapper)  # TODO accumulate
def main(args):
    goldF = args.goldfile
    sysFs = args.sysfile
    ss_mapper = lambda ss: coarsen_pss(ss, args.depth) if ss.startswith('p.') else ss

    # Load gold data
    gold_sents = list(load_sents(goldF, ss_mapper=ss_mapper))

    all_sys_scores = {}
    for sysF in sysFs:
        sysscores = eval_sys(sysF, gold_sents, ss_mapper)
        syspath = sysF.name
        basename = syspath.rsplit('.', 2)[0]
        if basename not in all_sys_scores:
            all_sys_scores[basename] = [
                defaultdict(lambda: defaultdict(Counter)),
                defaultdict(lambda: defaultdict(Counter))
            ]
        if syspath.split('.')[-2] == 'goldid':
            all_sys_scores[basename][0] = sysscores
        else:
            all_sys_scores[basename][1] = sysscores

    # Print output
    args.output_format(all_sys_scores, depth=args.depth, mode=args.output_mode)
def main(args):
    ss_mapper = SSMapper(args.depth)

    # Load gold data
    gold_sents = list(
        tqdm(load_sents(args.goldfile, ss_mapper=ss_mapper),
             desc="Reading " + args.goldfile.name, unit=" lines"))

    all_sys_scores = {}
    for lextags_file in args.lextags:
        # Load predictions
        with open(lextags_file, encoding="utf-8") as f:
            pred_sents = list(
                tqdm(swap_lextags(gold_sents, map(ast.literal_eval, f)),
                     desc="Reading " + lextags_file, unit=" lines"))
        s = StringIO()
        print_json(pred_sents, fh=s)
        s = BytesIO(s.getvalue().encode("utf-8"))
        s.name = "autoid.json"
        scores = eval_sys(s, gold_sents, ss_mapper)
        basename = lextags_file.rsplit('.', 2)[0]
        if basename not in all_sys_scores:
            all_sys_scores[basename] = [
                defaultdict(lambda: defaultdict(Counter)),
                defaultdict(lambda: defaultdict(Counter))
            ]
        if lextags_file.split('.')[-2] == 'goldid':
            all_sys_scores[basename][0] = scores
        else:
            all_sys_scores[basename][1] = scores

    # Print output
    args.output_format(all_sys_scores, depth=args.depth, mode=args.output_mode)
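# A minimal sketch of the in-memory file trick above: eval_sys() dispatches
# on the open file's .name attribute, and BytesIO has none, so a synthetic
# name whose next-to-last dotted component is 'autoid' is attached by hand.
# The buffer contents and name here are hypothetical.
from io import BytesIO
buf = BytesIO(b'[]')
buf.name = 'preds.autoid.json'
assert buf.name.split('.')[-2] == 'autoid'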
def main(args):
    with open(args.fname, encoding="utf-8") as f, \
            open(args.lextags, encoding="utf-8") as tags_lines:
        sents = load_sents(f, ss_mapper=SSMapper(args.depth),
                           validate_type=False, validate_pos=False)
        preds = load_tags(tags_lines)
        print_json(swap_tags(sents, preds))
def eval_sys(sysF, gold_sents, ss_mapper):
    goldid = (sysF.name.split('.')[-2] == 'goldid')
    if not goldid and sysF.name.split('.')[-2] != 'autoid':
        raise ValueError('File path of system output does not specify gold vs. auto '
                         f'identification of units to be labeled: {sysF.name}')
    compare_sets = compare_sets_Acc if goldid else compare_sets_PRF
    scores = defaultdict(lambda: defaultdict(Counter))
    for iSent, syssent in enumerate(load_sents(sysF, ss_mapper=ss_mapper)):
        sent = gold_sents[iSent]
        assert sent['sent_id'] == syssent['sent_id']
        eval_sent_tagging(sent, syssent, scores)
        for shapeclass in SHAPE_CLASSES:
            for ssclass in SS_CLASSES:
                eval_sent_by_classes(sent, syssent, shapeclass, ssclass,
                                     scores, compare_sets)
    for k in scores:
        if k[1] == 'Tags':
            if k[0] == '*':  # k is ('*', 'Tags')
                for subscore in ('Full', '-Lexcat', '-SS', '-Lexcat -SS'):
                    c = scores[k][subscore]
                    assert c['N'] > 0, (k, subscore, c)
                    c['Acc'] = Ratio(c['correct'], c['N'])
            elif k[0] in ('MWE', 'GappyMWE'):
                for subscore in ('Link+', 'Link-'):
                    c = scores[k][subscore]
                    c['P'] = Ratio(c['PNumer'], c['PDenom'])
                    c['R'] = Ratio(c['RNumer'], c['RDenom'])
                    c['F'] = f1(c['P'], c['R'])
                for m in ('P', 'R', 'F'):
                    # strength averaging
                    avg = (scores[k]['Link+'][m] + scores[k]['Link-'][m]) / 2  # float
                    # construct a ratio by averaging the denominators
                    # (this gives insight into the underlying recall denominators)
                    denom = (scores[k]['Link+'][m].denominator
                             + scores[k]['Link-'][m].denominator) / 2  # float
                    scores[k]['LinkAvg'][m] = Ratio(avg * denom, denom)
        elif goldid:
            # goldid means gold identification of spans & kind of supersense
            for subscore in ('Role', 'Fxn', 'Labeled'):
                c = scores[k][subscore]
                assert c['N'] > 0, (k, subscore, c)
                c['Acc'] = Ratio(c['correct'], c['N'])
        else:
            for subscore in ('ID', 'Role', 'Fxn', 'Labeled'):
                c = scores[k][subscore]
                c['P'] = Ratio(c['correct'], c['Pdenom'])
                c['R'] = Ratio(c['correct'], c['Rdenom'])
                c['F'] = f1(c['P'], c['R'])
    assert len(gold_sents) == iSent + 1, \
        f'Mismatch in number of sentences: {len(gold_sents)} gold, {iSent+1} system from {sysF.name}'
    return scores
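# Hedged sketch of the f1() helper used above, inferred from how it is called
# on P and R values; the real implementation may differ (e.g., in how it
# treats Ratio operands).
def f1(p, r):
    # harmonic mean of precision and recall; defined as 0 when both are 0
    return 2 * p * r / (p + r) if (p + r) else 0.0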
def _read(self, file_path):
    # if `file_path` is a URL, redirect to the cache
    file_path = cached_path(file_path)
    logger.info("Reading instances from lines in file at: %s", file_path)
    with open(file_path, 'r', encoding='utf-8') as tagging_file:
        tagging_data = load_sents(tagging_file)
        for instance in tagging_data:
            # Get the tokens
            tokens = [x["word"] for x in instance["toks"]]
            # Get their associated upos
            upos_tags = [x["upos"] for x in instance["toks"]]
            # Get their associated lemma
            lemmas = [x["lemma"] for x in instance["toks"]]
            # Get their associated lextag
            labels = [x["lextag"] for x in instance["toks"]]
            yield self.text_to_instance(tokens=tokens, upos_tags=upos_tags,
                                        lemmas=lemmas, streusle_lextags=labels)
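# A toy instance (hypothetical values, following the STREUSLE JSON token
# format read above) showing the parallel field extraction.
instance = {"toks": [
    {"word": "on", "upos": "ADP", "lemma": "on", "lextag": "O-P-p.Locus"},
    {"word": "time", "upos": "NOUN", "lemma": "time", "lextag": "O-N-n.TIME"},
]}
assert [x["word"] for x in instance["toks"]] == ["on", "time"]
assert [x["lextag"] for x in instance["toks"]] == ["O-P-p.Locus", "O-N-n.TIME"]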
            updates[sentid] = r
        else:  # continuation of second column from previous line
            assert sentid
            updates[sentid] += ' ' + r

"""
2. Scan the full corpus .conllulex for sentences with their original annotations.
If there was a change, parse the rendered lexical semantic analysis into tags,
substitute the tags in the UDlextag format, and parse the sentence to JSON in
order to update the fields: 'mwe', 'toks', 'swes', 'smwes', 'wmwes'
('etoks' etc. will be unaffected).
"""

sents = []
with open(conllulexFP, encoding='utf-8') as conllulexF:
    nUpdatedSents = 0
    for sent in load_sents(conllulexF, store_conllulex='toks'):
        sentid = sent['sent_id']
        if sentid in updates:
            # compare rendered strings to see whether there has been a change
            rendered_old = render_sent(sent, lexcats=True, supersenses=True)
            rendered_new = updates[sentid]
            if rendered_old != rendered_new:  # there has been a change
                # parse the new rendered string
                toks = [tok['word'] for tok in sent['toks']]
                tagging = unrender(rendered_new, toks)  # this should fail if tokens have changed
                toks2, bios, lbls = zip(*tagging)
                assert toks == list(toks2), (toks, toks2)  # be super-duper sure tokens haven't changed
                labeled_bio = [bio + ('-' + lbl.replace(':', '|') if lbl else '')
                               for bio, lbl in zip(bios, lbls)]
                # substitute new tagging in UDlextag format
                conllulex = sent['conllulex'].strip().split('\n')
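# A minimal worked example of the labeled-BIO construction above; the tag
# values are hypothetical, but the ':' -> '|' escaping and the bare tag for
# unlabeled tokens match the list comprehension.
bios = ('B', 'I_', 'O')
lbls = ('P-p.Topic', '', 'N-n.PERSON')
labeled_bio = [bio + ('-' + lbl.replace(':', '|') if lbl else '')
               for bio, lbl in zip(bios, lbls)]
assert labeled_bio == ['B-P-p.Topic', 'I_', 'O-N-n.PERSON']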
def eval_sys(sysF, gold_sents, ss_mapper):
    goldid = (sysF.name.split('.')[-2] == 'goldid')
    if not goldid and sysF.name.split('.')[-2] != 'autoid':
        raise ValueError('File path of system output does not specify gold vs. auto '
                         f'identification of units to be labeled: {sysF.name}')
    compare_sets = compare_sets_Acc if goldid else compare_sets_PRF
    scores = {
        'All': defaultdict(Counter),
        'MWE': defaultdict(Counter),
        'MWP': defaultdict(Counter)
    }

    def tally(c, goldunits, predunits):
        # identification plus role/function labeling scores for one subset of units
        c['ID'] += compare_sets(set(goldunits.keys()), set(predunits.keys()))
        c['Role,Fxn'] += compare_sets({(k, r, f) for k, (lc, r, f) in goldunits.items()},
                                      {(k, r, f) for k, (lc, r, f) in predunits.items()})
        c['Role'] += compare_sets({(k, r) for k, (lc, r, f) in goldunits.items()},
                                  {(k, r) for k, (lc, r, f) in predunits.items()})
        c['Fxn'] += compare_sets({(k, f) for k, (lc, r, f) in goldunits.items()},
                                 {(k, f) for k, (lc, r, f) in predunits.items()})

    for iSent, syssent in enumerate(load_sents(sysF, ss_mapper=ss_mapper)):
        sent = gold_sents[iSent]
        assert sent['sent_id'] == syssent['sent_id']
        # all units with a PSS label
        # (copy the gold units so the gold=?? deletions below stay local)
        goldunits = dict(sent['punits'])
        predunits = {
            tuple(e['toknums']): (e['lexcat'], e['ss'], e['ss2'])
            for e in list(syssent['swes'].values()) + list(syssent['smwes'].values())
            if e['ss'] and e['ss'].startswith('p.')
        }
        # special case: discard gold=?? tokens regardless of their predicted label
        for k, (lc, r, f) in list(goldunits.items()):
            if r == '??':
                if k in predunits:
                    del predunits[k]
                del goldunits[k]
        tally(scores['All'], goldunits, predunits)
        # MWEs only
        goldunits = {k: v for k, v in goldunits.items() if len(k) > 1}
        predunits = {k: v for k, v in predunits.items() if len(k) > 1}
        tally(scores['MWE'], goldunits, predunits)
        # multiword adpositions only: note this requires the lexcat to be predicted
        goldunits = {k: v for k, v in goldunits.items() if v[0] != 'PP'}
        predunits = {k: v for k, v in predunits.items() if v[0] != 'PP'}
        tally(scores['MWP'], goldunits, predunits)

    for k in ('All', 'MWE', 'MWP'):
        if goldid:
            for criterion in ('Role', 'Fxn', 'Role,Fxn'):
                c = scores[k][criterion]
                assert c['N'] > 0, (k, criterion, c)
                c['Acc'] = c['correct'] / c['N']
        else:
            for criterion in ('ID', 'Role', 'Fxn', 'Role,Fxn'):
                c = scores[k][criterion]
                c['P'] = c['correct'] / c['Pdenom']
                c['R'] = c['correct'] / c['Rdenom']
                c['F'] = f1(c['P'], c['R'])
    assert len(gold_sents) == iSent + 1, \
        f'Mismatch in number of sentences: {len(gold_sents)} gold, {iSent+1} system from {sysF.name}'
    return scores
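# Hedged sketch of the two set-comparison helpers selected above; these are
# assumptions inferred from the counter keys read back by eval_sys()
# ('correct', 'Pdenom', 'Rdenom', 'N'), and the real implementations may differ.
from collections import Counter

def compare_sets_PRF(gold, pred):
    # precision/recall bookkeeping: shared items over each side's denominator
    return Counter(correct=len(gold & pred), Pdenom=len(pred), Rdenom=len(gold))

def compare_sets_Acc(gold, pred):
    # accuracy bookkeeping for gold identification: one trial per gold unit
    return Counter(correct=len(gold & pred), N=len(gold))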
def main(args):
    if args.colorless or not sys.stdin.isatty():
        for c in dir(Colors):
            if not c.startswith('_'):
                setattr(Colors, c, '')
        for s in dir(Styles):
            if not s.startswith('_'):
                setattr(Styles, s, '')

    goldF = args.goldfile
    sysFs = args.sysfile
    ss_mapper = lambda ss: coarsen_pss(ss, args.depth) if ss.startswith('p.') else ss

    # Load gold data
    gold_sents = list(load_sents(goldF, ss_mapper=ss_mapper))
    predFs = [load_sents(predFP, ss_mapper=ss_mapper) for predFP in sysFs]
    all_sys_scores = {}

    def filter_labels(ll):
        result = dict(ll)
        for k, l in ll.items():
            if l.startswith('n.') and args.no_noun:
                del result[k]
            elif l.startswith('v.') and args.no_verb:
                del result[k]
            elif l.startswith('p.') and args.no_snacs:
                del result[k]
        return result

    R = lambda ww, sg, wg, ll: render(ww,
                                      sg if not args.no_mwe else [],
                                      wg if not args.no_mwe else [],
                                      filter_labels(ll))

    for i, sent in enumerate(gold_sents):
        # gold analysis
        words = [t["word"] for t in sent["toks"]]
        rendered = []
        rendered.append(R(words,
                          [e["toknums"] for e in sent["smwes"].values()],
                          [e["toknums"] for e in sent["wmwes"].values()],
                          makelabelmap(sent, include_lexcat=args.lexcats,
                                       include_supersenses=True)))
        for predF in predFs:
            psent = next(predF)
            assert psent['sent_id'] == sent['sent_id']
            rendered.append(R(words,
                              [e["toknums"] for e in psent["smwes"].values()],
                              [e["toknums"] for e in psent["wmwes"].values()],
                              makelabelmap(sent, include_lexcat=args.lexcats,
                                           include_supersenses=True)))
        diff_classes = set()
        if not args.no_diff:
            diff_classes.add('special')
            if not args.no_mwe_diff:
                diff_classes.add('mwe')
            if not args.no_noun_diff:
                diff_classes.add('n')
            if not args.no_snacs_diff:
                diff_classes.add('p')
            if not args.no_verb_diff:
                diff_classes.add('v')
        if args.sent_ids:
            print(sent['sent_id'], end='\t')
        print(color_rendered(words, rendered, diff_classes))
        #assert False,(color_rendered(words, rendered),words,rendered)

    # restore the terminal's default colors
    print(Colors.ENDC, end='')
@author: Nathan Schneider (@nschneid)
"""
import os, sys, fileinput, re, json, csv
from collections import defaultdict
from itertools import chain

from conllulex2json import load_sents, print_json

inFname, = sys.argv[1:]

nSentsRenumbered = 0
nMWEsRenumbered = 0
with open(inFname, encoding='utf-8') as inF:
    sents = list(load_sents(inF))
    for sent in sents:
        smwes = sent["smwes"]
        wmwes = sent["wmwes"]
        allmwes = []
        for oldnum, e in smwes.items():
            allmwes.append((e["toknums"][0], 's', oldnum))
        for oldnum, e in wmwes.items():
            allmwes.append((e["toknums"][0], 'w', oldnum))
        allmwes.sort()
        current_sort = sorted(allmwes, key=lambda x: x[2])
        if allmwes != current_sort:
            nSentsRenumbered += 1
            # renumber
            new_smwes = {}
            new_wmwes = {}
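# A minimal sketch of the renumbering trigger above, with hypothetical token
# offsets: the MWE numbered 2 starts before the MWE numbered 1, so positional
# order and numbering order disagree and the sentence would be renumbered.
allmwes = [(1, 's', 2), (4, 'w', 1)]  # already sorted by first token
assert allmwes != sorted(allmwes, key=lambda x: x[2])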
import argparse, collections, json

parser = argparse.ArgumentParser(description='Augment Data')
parser.add_argument("conllulex", type=str, help="Augment CoNLL-U/CoNLL-U-Lex/JSON file")
parser.add_argument("mrp", type=str, help="Input MRP file")
parser.add_argument("output", type=str, help="Output Augmented file")
args = parser.parse_args()

conllulex_file = args.conllulex
mrp_file = args.mrp
out_file = args.output

with open(conllulex_file, 'r', encoding='utf8') as f_c:
    augs = {
        sent["sent_id"].replace("reviews-", ""): sent
        for sent in load_sents(CompanionToConllulex(f_c))
    }

with open(mrp_file, 'r', encoding='utf8') as f_in, \
        open(out_file, 'w', encoding='utf8') as f_out:
    for line in f_in:
        mrp = json.loads(line, object_pairs_hook=collections.OrderedDict)
        sent_id = mrp['id']
        aug = augs.get(sent_id)  # .get() so a missing id yields None rather than a KeyError
        if aug is None:
            print("id:{} not in companion".format(sent_id))
        else:
            add_token_ranges(aug["toks"], aug["text"])
            mrp['companion'] = aug
        f_out.write(json.dumps(mrp) + '\n')