def print_analyses_ftb(wordn, surf, anal, outfile, hacks=None):
    """Write one analysis as a tab-separated, ten-column row to *outfile*.

    The columns are: word number, surface form, lemmas joined with ``#``,
    the FTB xpos tag (written twice), the FTB feature string, and four
    ``_`` placeholders.

    Args:
        wordn: value printed in the first column.
        surf: value printed in the second column.
        anal: analysis object handed to the FTB formatting helpers.
        outfile: writable text stream receiving the row.
        hacks: accepted for signature compatibility; not used here.
    """
    xpos = format_xpos_ftb(anal)
    lemma_column = "#".join(get_lemmas(anal))
    feats_column = format_feats_ftb(anal)
    row = [wordn, surf, lemma_column, xpos, xpos, feats_column]
    # The four trailing columns are always left unspecified.
    row.extend(["_"] * 4)
    print(*row, sep="\t", file=outfile)
def print_analyses_conllu(wordn, surf, anal, outfile, hacks=None):
    """Write one analysis as a tab-separated, ten-column row to *outfile*.

    The columns are: word number, surface form, lemmas joined with ``#``,
    the upos tag (``X`` when the helper yields nothing), an xpos tag, the
    UD feature string, three ``_`` placeholders and the UD misc string.

    Args:
        wordn: value printed in the first column.
        surf: value printed in the second column.
        anal: analysis object handed to the formatting helpers.
        outfile: writable text stream receiving the row.
        hacks: when equal to ``'ftb'`` the xpos column uses the FTB
            formatter, otherwise the TDT one; also forwarded to
            ``format_feats_ud``.
    """
    # Fall back to the catch-all tag when no upos is available.
    upos = get_upos(anal) or 'X'
    xpos_formatter = format_xpos_ftb if hacks == 'ftb' else format_xpos_tdt
    xpos = xpos_formatter(anal)
    row = (
        wordn,
        surf,
        "#".join(get_lemmas(anal)),
        upos,
        xpos,
        format_feats_ud(anal, hacks),
        "_",
        "_",
        "_",
        format_misc_ud(anal),
    )
    print(*row, sep="\t", file=outfile)
def try_analyses_ftb(original, wordn, surf, anals, outfile, hacks=None):
    """Print the analysis from *anals* that best agrees with *original*.

    Candidates are compared against fields of *original* (index 3 for the
    xpos tag, index 5 for the features, index 2 for the lemma) in three
    passes of decreasing strictness:

    1. xpos, features and lemma all agree;
    2. xpos and features agree;
    3. only xpos agrees.

    The first candidate satisfying the current pass is printed with
    ``print_analyses_ftb``.  When no pass succeeds, the first element of
    *anals* is printed instead.

    Args:
        original: indexable reference row to compare against.
        wordn: forwarded to ``print_analyses_ftb``.
        surf: forwarded to ``print_analyses_ftb``.
        anals: sequence of candidate analyses; it is traversed up to
            three times and indexed with ``[0]`` as a last resort.
        outfile: writable text stream, forwarded to ``print_analyses_ftb``.
        hacks: accepted for signature compatibility; not used here.

    Returns:
        Whatever ``print_analyses_ftb`` returns.
    """
    def xpos_agrees(candidate):
        return format_xpos_ftb(candidate) == original[3]

    def feats_agree(candidate):
        return format_feats_ftb(candidate) == original[5]

    def lemma_agrees(candidate):
        return "#".join(get_lemmas(candidate)) == original[2]

    # Each tier drops the last (most specific) criterion of the previous one.
    tiers = (
        (xpos_agrees, feats_agree, lemma_agrees),
        (xpos_agrees, feats_agree),
        (xpos_agrees,),
    )
    for criteria in tiers:
        for candidate in anals:
            # all() on a generator stops at the first failing criterion, so
            # later helpers are only called once earlier ones have passed.
            if all(criterion(candidate) for criterion in criteria):
                return print_analyses_ftb(wordn, surf, candidate, outfile)
    # Nothing agrees even on xpos: fall back to the first analysis.
    return print_analyses_ftb(wordn, surf, anals[0], outfile)
def _percent(part, whole):
    """Return *part* as a percentage of *whole*, or 0 when *whole* is 0."""
    return part / whole * 100 if whole != 0 else 0


def main():
    """Evaluate analyser coverage against a frequency list.

    Each input line is expected to hold a frequency followed by a surface
    form (first space or tabs as separators); in ``ftb3.1`` format two more
    tab-separated fields, a reference lemma and a reference analysis, are
    read as well.  Every surface form is analysed, per-token diagnostics are
    written to the output file, and frequency-weighted totals are written to
    the statistics file.

    The process always terminates through ``exit``:

    * 0 -- the measured percentage is above the threshold;
    * 1 -- the measured percentage is at or below the threshold;
    * 2 -- the analyser could not be loaded, or no lines were counted.
    """
    a = ArgumentParser()
    a.add_argument('-a', '--analyser', metavar='FSAFILE', required=True,
                   help="load analyser from FSAFILE")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-o', '--output', metavar="OUTFILE",
                   type=FileType('w'),
                   dest="outfile", help="log outputs to OUTFILE")
    a.add_argument('-X', '--statistics', metavar="STATFILE",
                   type=FileType('w'),
                   dest="statfile", help="statistics")
    a.add_argument('-v', '--verbose', action="store_true", default=False,
                   help="Print verbosely while processing")
    a.add_argument('-C', '--no-casing', action="store_true", default=False,
                   help="Do not try to recase input and output when matching")
    a.add_argument('-f', '--format', metavar="FORMAT",
                   help="use FORMAT formatter to compare analyses",
                   choices=["coverage", "ftb3.1"], default="coverage")
    # type=int lets argparse reject malformed numbers up front instead of
    # failing with a traceback in the middle of processing.
    a.add_argument('-c', '--count', metavar="FREQ", default=0, type=int,
                   help="test only word-forms with frequency higher than FREQ")
    a.add_argument('-t', '--threshold', metavar="THOLD", default=99, type=int,
                   help="if coverage is less than THOLD exit with error")

    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    try:
        if options.analyser:
            if options.verbose:
                print("reading analyser from", options.analyser)
            omorfi.load_analyser(options.analyser)
        if not options.infile:
            options.infile = stdin
            print("reading from <stdin>")
        if not options.statfile:
            options.statfile = stdout
        if not options.outfile:
            options.outfile = stdout
    except IOError:
        print("Could not process file", options.analyser, file=stderr)
        exit(2)

    compare = options.format != 'coverage'
    # Comparison modes read fields[2] and fields[3]; plain coverage only
    # needs the frequency and the surface form.
    min_fields = 4 if compare else 2

    # basic statistics
    covered = 0
    full_matches = 0
    lemma_matches = 0
    anal_matches = 0
    no_matches = 0
    no_results = 0
    lines = 0
    # for make check target
    threshold = options.threshold
    realstart = perf_counter()
    cpustart = process_time()
    for line in options.infile:
        fields = line.strip().replace(' ', '\t', 1).split('\t')
        if len(fields) < min_fields:
            print("ERROR: Skipping line", fields, file=stderr)
            continue
        try:
            freq = int(fields[0])
        except ValueError:
            # Non-numeric frequency column: treat like any malformed line.
            print("ERROR: Skipping line", fields, file=stderr)
            continue
        if freq < options.count:
            # Stops at the first low-frequency entry; this presumably relies
            # on the input being sorted by descending frequency.
            break
        surf = fields[1]
        lemma = surf
        analysis = surf
        if compare:
            lemma = fields[2]
            # hacks ftb3: strip the " >>>" marker once per line rather than
            # once per candidate analysis.
            analysis = fields[3].replace(" >>>", "")
        lines += freq
        if options.verbose:
            print(lines, '(', freq, ') ...', end='\r')
        anals = omorfi.analyse(surf)
        if not is_tokenlist_oov(anals):
            covered += freq
        else:
            no_results += freq
            print("OOV", surf, sep='\t', file=options.outfile)
        found_anals = False
        found_lemma = False
        if options.format == 'ftb3.1':
            for anal in anals:
                anal_ftb3 = format_feats_ftb(anal)
                lemma_ftb3 = '#'.join(get_lemmas(anal))
                if analysis == anal_ftb3:
                    found_anals = True
                    print("ANALHIT", analysis, anal_ftb3,
                          file=options.outfile)
                elif set(anal_ftb3.split()) == set(analysis.split()):
                    # Same tags in a different order.
                    found_anals = True
                    print("PERMUTAHIT", analysis, anal_ftb3,
                          file=options.outfile)
                else:
                    print("ANALMISS", analysis, anal_ftb3,
                          file=options.outfile)
                if lemma == lemma_ftb3:
                    found_lemma = True
                    print("LEMMAHIT", lemma, lemma_ftb3,
                          file=options.outfile)
                elif lemma.replace('#', '') == lemma_ftb3.replace('#', ''):
                    # Same lemma with different compound boundaries.
                    found_lemma = True
                    print("LEMMARECOMP", lemma, lemma_ftb3,
                          file=options.outfile)
                else:
                    print("LEMMAMISS", lemma, lemma_ftb3,
                          file=options.outfile)
        if compare:
            # The four combinations of the two flags are exhaustive.
            # NOTE(review): anal_matches is incremented when the analysis
            # was NOT found and lemma_matches when the lemma was NOT found;
            # the counter names (and the "Lemma"/"Anals" statistics columns)
            # look swapped -- kept as-is, confirm the intended meaning.
            if found_anals and found_lemma:
                print("HIT", surf, sep='\t', file=options.outfile)
                full_matches += freq
            elif found_lemma:
                anal_matches += freq
                print("LEMMANOANAL", surf, sep='\t', file=options.outfile)
            elif found_anals:
                lemma_matches += freq
                print("ANALNOLEMMA", surf, sep='\t', file=options.outfile)
            else:
                no_matches += freq
                print("NOHITS!", surf, sep='\t', file=options.outfile)
    realend = perf_counter()
    cpuend = process_time()
    print("CPU time:", cpuend - cpustart, "real time:", realend - realstart)

    print("Lines", "Covered", "OOV", sep="\t", file=options.statfile)
    print(lines, covered, lines - covered, sep="\t", file=options.statfile)
    print(_percent(lines, lines),
          _percent(covered, lines),
          _percent(lines - covered, lines),
          sep="\t", file=options.statfile)
    if options.format == 'ftb3.1':
        print("Lines", "Matches", "Lemma", "Anals", "Mismatch", "No results",
              sep="\t", file=options.statfile)
        print(lines, full_matches, lemma_matches, anal_matches, no_matches,
              no_results, sep="\t", file=options.statfile)
        print(_percent(lines, lines),
              _percent(full_matches, lines),
              _percent(lemma_matches, lines),
              _percent(anal_matches, lines),
              _percent(no_matches, lines),
              _percent(no_results, lines),
              sep="\t", file=options.statfile)

    # NOTE(review): the comparisons below use "<=", so a result exactly equal
    # to the threshold fails even though the --threshold help text says
    # "less than" -- kept as-is, confirm which is intended.
    if lines == 0:
        print("Needs more than 0 lines to determine something",
              file=stderr)
        exit(2)
    elif options.format == 'ftb3.1' and \
            (full_matches / lines * 100 <= threshold):
        print("needs to have", threshold, "% matches to pass regress test\n",
              "please examine", options.outfile.name, "for regressions",
              file=stderr)
        exit(1)
    elif options.format == 'coverage' and \
            (covered / lines * 100 <= threshold):
        print("needs to have", threshold, "% coverage to pass regress test\n",
              "please examine", options.outfile.name, "for regressions",
              file=stderr)
        exit(1)
    else:
        exit(0)