# This program is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License version 3 as # published by the Free Software Foundation. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. from omorfi.omorfi import Omorfi omorfi = Omorfi() omorfi.load_analyser("/usr/local/share/omorfi/omorfi.analyse.hfst") omorfi.load_generator("/usr/local/share/omorfi/omorfi.generate.hfst") import settings PROPERTIES = { "nominatiivi": [("CASE", "NOM")], "genetiivi": [("CASE", "GEN")], "partitiivi": [("CASE", "PAR")], "translatiivi": [("CASE", "TRA")], "essiivi": [("CASE", "ESS")], "inessiivi": [("CASE", "INE")], "elatiivi": [("CASE", "ELA")], "illatiivi": [("CASE", "ILL")], "adessiivi": [("CASE", "ADE")], "ablatiivi": [("CASE", "ABL")],
def main():
    """Invoke a simple CLI analyser producing VISL CG-3 style output.

    Reads tokens from the input in text, vislcg or conllu format,
    analyses (or guesses) each surface token with omorfi and prints the
    analyses via print_analyses_vislcg3.
    """
    a = ArgumentParser()
    a.add_argument('-a', '--analyser', metavar='AFILE',
                   help="load analyser model from AFILE")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                   help="print output into OUTFILE", type=FileType('w'))
    a.add_argument('-F', '--format', metavar="INFORMAT", default='text',
                   help="read input using INFORMAT tokenisation",
                   choices=['text', 'vislcg', 'conllu'])
    a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile",
                   help="print statistics to STATFILE", type=FileType('w'))
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.analyser:
        if options.verbose:
            print("reading analyser model", options.analyser)
        omorfi.load_analyser(options.analyser)
    else:
        print("analyser is required to vislcg", file=stderr)
        exit(4)
    if not options.infile:
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        # keep statistics away from the analysis stream when analyses go
        # to a real file; mix them only when everything is on stdout
        if options.outfile == stdout:
            options.statfile = stdout
        else:
            options.statfile = stderr
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    unknowns = 0
    last = None
    for line in options.infile:
        surfs = []
        if options.format == 'vislcg':
            surfs = get_line_tokens_vislcg(line, last)
        elif options.format == 'text':
            surfs = get_line_tokens(line, omorfi)
        elif options.format == 'conllu':
            surfs = get_line_tokens_conllu(line, last)
        else:
            print("input format missing implementation", options.format,
                  file=stderr)
            exit(2)
        for surf in surfs:
            if 'conllu_form' in surf:
                # skip conllu special forms in input for now:
                # (ellipsis and MWE magics)
                continue
            elif 'surf' in surf:
                tokens += 1
                anals = omorfi.analyse(surf)
                if len(anals) == 0 or (len(anals) == 1 and
                                       'UNKNOWN' in anals[0]['anal']):
                    unknowns += 1
                    anals = omorfi.guess(surf)
                print_analyses_vislcg3(surf, anals, options.outfile)
            elif 'comment' in surf:
                # ';' and tab-initial comments are vislcg-internal; drop them
                if surf['comment'].startswith(';') or \
                        surf['comment'].startswith('\t'):
                    continue
                else:
                    print(surf['comment'], file=options.outfile)
            elif 'error' in surf:
                print(surf['error'], file=stderr)
                exit(2)
            last = surf
    cpuend = process_time()
    realend = perf_counter()
    # fixed: guard the percentage against ZeroDivisionError on empty input
    # (consistent with the other omorfi CLI analysers)
    print("# Tokens:", tokens, "\n# Unknown:", unknowns,
          unknowns / tokens * 100 if tokens != 0 else 0, "%",
          file=options.statfile)
    print("# CPU time:", cpuend - cpustart, "\n# Real time:",
          realend - realstart, file=options.statfile)
    print("# Tokens per timeunit:", tokens / (realend - realstart),
          file=options.statfile)
    exit(0)
def main():
    """Invoke a simple CLI tokeniser.

    Tokenises input lines with omorfi and writes the tokens in one of
    the supported output formats (moses, conllu, json, ftb3), followed
    by throughput statistics.
    """
    a = ArgumentParser()
    a.add_argument('-a', '--analyser', metavar='AFILE',
                   help="load tokeniser model from (analyser) AFILE",
                   required=True)
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                   help="print output into OUTFILE", type=FileType('w'))
    a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile",
                   help="print statistics to STATFILE", type=FileType('w'))
    a.add_argument('-O', '--output-format', metavar="OUTFORMAT",
                   default="moses",
                   help="format output for OUTFORMAT",
                   choices=['moses', 'conllu', 'json', 'ftb3'])
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.analyser:
        if options.verbose:
            print("reading language model", options.analyser)
        omorfi.load_analyser(options.analyser)
    else:
        # unreachable while --analyser stays required=True; kept as safety net
        print("analyser is needed for tokenisation", file=stderr)
        exit(1)
    if not options.infile:
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        options.statfile = stdout
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    lines = 0
    if options.output_format == 'conllu':
        print("# new doc id=", options.infile.name, file=options.outfile)
    for line in options.infile:
        lines += 1
        if options.verbose and lines % 10000 == 0:
            print(lines, "...")
        if not line or line.rstrip('\n') == '':
            continue
        surfs = omorfi.tokenise(line)
        tokens += len(surfs)
        if options.output_format == 'moses':
            print(' '.join([surf['surf'] for surf in surfs]),
                  file=options.outfile)
        elif options.output_format == 'json':
            # fixed: the json module has json.dumps (json.encode raised
            # AttributeError), and the result belongs in the selected
            # output file like every other format branch
            print(json.dumps(surfs), file=options.outfile)
        elif options.output_format == 'conllu':
            print("# sent_id =", lines, file=options.outfile)
            print("# text =", line.rstrip("\n"), file=options.outfile)
            i = 1
            for surf in surfs:
                print(i, surf['surf'], "_", "_", "_", "_", "_", "_", "_",
                      format_misc_ud(surf), sep="\t", file=options.outfile)
                i += 1
        elif options.output_format == 'ftb3':
            print("<s><loc file=\"", options.infile.name, "\" line=\"",
                  lines, "\" />", file=options.outfile, sep="")
            i = 1
            for surf in surfs:
                print(i, surf['surf'], "_", "_", "_", "_", "_", "_", "_",
                      "_", sep="\t", file=options.outfile)
                i += 1
            print("</s>", file=options.outfile)
        if options.output_format == 'conllu':
            # one blank line terminates each conllu sentence
            print(file=options.outfile)
    cpuend = process_time()
    realend = perf_counter()
    # fixed: guard the ratio against ZeroDivisionError on empty input
    print("Lines:", lines, "Tokens:", tokens, "Ratio:",
          tokens / lines if lines != 0 else 0, "tokens/line",
          file=options.statfile)
    print("CPU time:", cpuend - cpustart, "Real time:", realend - realstart,
          file=options.statfile)
    print("Tokens per timeunit:", tokens / (realend - realstart),
          "Lines per timeunit:", lines / (realend - realstart),
          file=options.statfile)
    exit(0)
def main():
    """Invoke a simple CLI analyser for CONLL-U formatted data.

    Reads 10-field conllu lines, analyses (or guesses) each surface
    form with omorfi, and rewrites the lines with the analyses filled
    in; comments and sentence breaks are passed through.
    """
    a = ArgumentParser()
    a.add_argument('-a', '--analyser', metavar='AFILE',
                   help="read analyser model from AFILE")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                   help="print output into OUTFILE", type=FileType('w'))
    a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile",
                   help="print statistics to STATFILE", type=FileType('w'))
    a.add_argument('-O', '--oracle', action='store_true',
                   help="match to values in input when parsing if possible")
    a.add_argument('-u', '--udpipe', metavar="UDPIPE",
                   help='use UDPIPE for additional guesses (experi-mental)')
    a.add_argument('--hacks', metavar='HACKS',
                   help="mangle analyses to match HACKS version of UD",
                   choices=['ftb'])
    a.add_argument('-X', '--frequencies', metavar="FREQDIR",
                   help="read frequencies from FREQDIR/*.freqs")
    a.add_argument('--debug', action='store_true',
                   help="print lots of debug info while processing")
    options = a.parse_args()
    if options.verbose:
        print("Printing verbosely")
    omorfi = Omorfi(options.verbose)
    if options.analyser:
        if options.verbose:
            print("reading analyser model", options.analyser)
        omorfi.load_analyser(options.analyser)
    else:
        # fixed: was file=stdrr — a NameError that crashed exactly on the
        # missing-analyser error path
        print("analyser is needed to conllu", file=stderr)
        exit(4)
    if options.udpipe:
        if options.verbose:
            print("Loading udpipe", options.udpipe)
        omorfi.load_udpipe(options.udpipe)
    if not options.infile:
        print("reading from <stdin>")
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        options.statfile = stdout
    if options.frequencies:
        with open(options.frequencies + '/lexemes.freqs') as lexfile:
            omorfi.load_lexical_frequencies(lexfile)
        with open(options.frequencies + '/omors.freqs') as omorfile:
            omorfi.load_omortag_frequencies(omorfile)
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    unknowns = 0
    sentences = 0
    recognised_comments = [
        'sent_id =', 'text =', 'doc-name:', 'sentence-text:'
    ]
    for line in options.infile:
        fields = line.strip().split('\t')
        if len(fields) == 10:
            # conllu is 10 field format
            tokens += 1
            try:
                index = int(fields[0])
            except ValueError:
                if '-' in fields[0]:
                    # MWE
                    continue
                elif '.' in fields[0]:
                    # a ghost
                    continue
                else:
                    print("Cannot figure out token index", fields[0],
                          file=stderr)
                    exit(1)
            surf = fields[1]
            anals = omorfi.analyse(surf)
            if not anals or len(anals) == 0 or (len(anals) == 1 and
                                                'OOV' in anals[0]):
                unknowns += 1
                anals = omorfi.guess(surf)
            if anals and len(anals) > 0:
                if options.debug:
                    debug_analyses_conllu(fields, index, surf, anals,
                                          options.outfile, options.hacks)
                elif options.oracle:
                    try_analyses_conllu(fields, index, surf, anals,
                                        options.outfile, options.hacks)
                else:
                    print_analyses_conllu(index, surf, anals[0],
                                          options.outfile, options.hacks)
            else:
                print("Failed:", fields)
                exit(1)
        elif line.startswith('#'):
            print(line.strip(), file=options.outfile)
            recognised = False
            for rec in recognised_comments:
                if line.startswith('# ' + rec):
                    recognised = True
            if not recognised and options.verbose:
                print("Warning! Unrecognised comment line:", line, sep='\n')
        elif not line or line.strip() == '':
            # retain exactly 1 empty line between sents
            print(file=options.outfile)
            sentences += 1
        else:
            print("Error in conllu format: '", line, "'", file=stderr)
            exit(1)
    cpuend = process_time()
    realend = perf_counter()
    print("Tokens:", tokens, "Sentences:", sentences,
          file=options.statfile)
    print("Unknowns / OOV:", unknowns, "=",
          unknowns / tokens * 100 if tokens != 0 else 0, "%",
          file=options.statfile)
    print("CPU time:", cpuend - cpustart, "Real time:", realend - realstart,
          file=options.statfile)
    print("Tokens per timeunit:", tokens / (realend - realstart),
          file=options.statfile)
    exit(0)
def main():
    """Preprocess text for moses factored modeling.

    For each whitespace-separated token, prints a ``surf|lemmas|pos|
    feats|morphs`` factor tuple derived from the first omorfi analysis
    and the first segmentation.
    """
    a = ArgumentParser()
    a.add_argument('-a', '--analyser', metavar='AFILE',
                   help="load analyser model from AFILE", required=True)
    a.add_argument('-s', '--segmenter', metavar='SFILE',
                   help="load segmenter model from SFILE", required=True)
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE",
                   help="print factors into OUTFILE")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.analyser:
        if options.verbose:
            print("Reading analyser model", options.analyser)
        omorfi.load_analyser(options.analyser)
    else:
        print("at least analyser file is needed", file=stderr)
        exit(1)
    if options.segmenter:
        if options.verbose:
            print("Reading segmenter model", options.segmenter)
        omorfi.load_segmenter(options.segmenter)
    else:
        print("at least segmenter file is needed", file=stderr)
        exit(1)
    if options.infile:
        infile = options.infile
    else:
        infile = stdin
    if options.output:
        outfile = open(options.output, 'w')
    else:
        outfile = stdout
    if options.verbose:
        print("reading from", infile.name)
    if options.verbose:
        # fixed typo in verbose message: was "writign to"
        print("writing to", outfile.name)
    # raw strings so the regex escapes are not mis-read as string escapes
    re_lemma = re.compile(r"\[WORD_ID=([^]]*)\]")
    re_pos = re.compile(r"\[UPOS=([^]]*)\]")
    re_mrd = re.compile(r"\[([^=]*)=([^]]*)]")
    linen = 0
    for line in infile:
        line = line.strip()
        linen += 1
        if options.verbose and linen % 10000 == 0:
            print(linen, '...')
        if not line or line == '':
            continue
        surfs = line.split()
        for surf in surfs:
            anals = omorfi.analyse(surf)
            segments = omorfi.segment(surf)
            pos_matches = re_pos.finditer(anals[0]['anal'])
            pos = "UNK"
            mrds = []
            lemmas = []
            for pm in pos_matches:
                pos = pm.group(1)
            lemma_matches = re_lemma.finditer(anals[0]['anal'])
            for lm in lemma_matches:
                lemmas += [lm.group(1)]
            mrd_matches = re_mrd.finditer(anals[0]['anal'])
            for mm in mrd_matches:
                if mm.group(1) == 'WORD_ID':
                    # a new lexeme starts here: keep only the features of
                    # the last lexeme in a compound
                    mrds = []
                elif mm.group(1) == 'WEIGHT':
                    pass
                else:
                    mrds += [mm.group(2)]
            # suffixes = everything after the LAST morph boundary marker
            parts = segments[0]['segments']
            if '{DB}' in parts:
                suffixes = parts[parts.rfind('{DB}') + 4:]
            elif '{WB}' in parts:
                suffixes = parts[parts.rfind('{WB}') + 4:]
            elif '{hyph?}' in parts:
                # fixed off-by-one: '{hyph?}' is 7 characters long
                # (DB/WB above use +4 for their 4-character markers)
                suffixes = parts[parts.rfind('{hyph?}') + 7:]
            else:
                suffixes = "0"
            morphs = suffixes[suffixes.find("{"):].replace("{MB}", ".")
            print(surf, '+'.join(lemmas), pos, '.'.join(mrds), morphs,
                  sep='|', end=' ', file=outfile)
        print(file=outfile)
    exit(0)
def main():
    """Preprocess text for moses factored modeling.

    Tokenises each input line with omorfi and prints a ``surf|lemmas|
    pos|feats|morphs`` factor tuple per token, using the token-object
    API (get_best / get_upos / get_ufeats / get_lemmas).
    """
    a = ArgumentParser()
    a.add_argument('-a', '--analyser', metavar='AFILE',
                   help="load analyser model from AFILE", required=True)
    a.add_argument('-s', '--segmenter', metavar='SFILE',
                   help="load segmenter model from SFILE", required=True)
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE",
                   help="print factors into OUTFILE")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.analyser:
        if options.verbose:
            print("Reading analyser model", options.analyser)
        omorfi.load_analyser(options.analyser)
    else:
        print("at least analyser file is needed", file=stderr)
        exit(1)
    if options.segmenter:
        if options.verbose:
            print("Reading segmenter model", options.segmenter)
        omorfi.load_segmenter(options.segmenter)
    else:
        print("at least segmenter file is needed", file=stderr)
        exit(1)
    if options.infile:
        infile = options.infile
    else:
        infile = stdin
    if options.output:
        outfile = open(options.output, 'w')
    else:
        outfile = stdout
    if options.verbose:
        print("reading from", infile.name)
    if options.verbose:
        # fixed typo in verbose message: was "writign to"
        print("writing to", outfile.name)
    linen = 0
    for line in infile:
        line = line.strip()
        linen += 1
        if options.verbose and linen % 10000 == 0:
            print(linen, '...')
        if not line or line == '':
            continue
        tokens = omorfi.tokenise_sentence(line)
        for token in tokens:
            if not token.surf:
                continue
            anals = omorfi.analyse(token)
            # fall-back factors for unanalysable tokens
            pos = "X"
            mrds = ["?"]
            lemmas = [token.surf]
            if anals:
                anal = token.get_best()
                pos = anal.get_upos()
                mrds = anal.get_ufeats()
                lemmas = anal.get_lemmas()
            segments = omorfi.segment(token)
            morphs = "0"
            if segments:
                segment = token.get_best_segments()
                if segment:
                    parts = segment.get_segments()
                    morphs = ".".join(parts)
                else:
                    morphs = token.surf
            print(token.surf, '+'.join(lemmas), pos, '.'.join(mrds), morphs,
                  sep='|', end=' ', file=outfile)
        print(file=outfile)
    exit(0)
def main():
    """Invoke a simple CLI analyser for FTB-3 formatted data.

    Reads 10-field ftb3 lines, analyses (or guesses) each surface form
    with omorfi, and rewrites the lines with the analyses filled in;
    SGML-ish markup lines and sentence breaks are passed through.
    """
    a = ArgumentParser()
    a.add_argument('-a', '--analyser', metavar='AFILE',
                   help="read analyser model from AFILE", required=True)
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                   help="print output into OUTFILE", type=FileType('w'))
    a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile",
                   help="print statistics to STATFILE", type=FileType('w'))
    a.add_argument('-O', '--oracle', action='store_true',
                   help="match to values in input when parsing if possible")
    a.add_argument('-X', '--frequencies', metavar="FREQDIR",
                   help="read frequencies from FREQDIR/*.freqs")
    a.add_argument('--debug', action='store_true',
                   help="print lots of debug info while processing")
    options = a.parse_args()
    if options.verbose:
        print("Printing verbosely")
    omorfi = Omorfi(options.verbose)
    if options.analyser:
        if options.verbose:
            print("reading analyser model", options.analyser)
        omorfi.load_analyser(options.analyser)
    else:
        # unreachable while --analyser stays required=True; kept as safety net
        print("analyser is needed to ftb3", file=stderr)
        exit(4)
    if not options.infile:
        print("reading from <stdin>")
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        options.statfile = stdout
    # (removed unused locals lexprobs/tagprobs — never read)
    if options.frequencies:
        with open(options.frequencies + '/lexemes.freqs') as lexfile:
            omorfi.load_lexical_frequencies(lexfile)
        with open(options.frequencies + '/omors.freqs') as omorfile:
            omorfi.load_omortag_frequencies(omorfile)
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    unknowns = 0
    sentences = 0
    for line in options.infile:
        fields = line.strip().split('\t')
        if len(fields) == 10:
            # ftb is 10 field format
            tokens += 1
            try:
                index = int(fields[0])
            except ValueError:
                print("Cannot figure out token index", fields[0],
                      file=stderr)
                exit(1)
            surf = fields[1]
            anals = omorfi.analyse(surf)
            if not anals or len(anals) == 0 or (len(anals) == 1 and
                                                'OOV' in anals[0]):
                unknowns += 1
                anals = omorfi.guess(surf)
            if anals and len(anals) > 0:
                if options.oracle:
                    try_analyses_ftb(fields, index, surf, anals,
                                     options.outfile)
                else:
                    print_analyses_ftb(index, surf, anals[0],
                                       options.outfile)
            else:
                print("Failed:", fields)
                exit(1)
        elif line.startswith('<') and line.rstrip().endswith('>'):
            # pass SGML-ish markup through untouched
            print(line.strip(), file=options.outfile)
        elif not line or line.strip() == '':
            # retain exactly 1 empty line between sents
            print(file=options.outfile)
            sentences += 1
        else:
            print("Error in ftb3 format: '", line, "'", file=stderr)
            exit(1)
    cpuend = process_time()
    realend = perf_counter()
    print("Tokens:", tokens, "Sentences:", sentences,
          file=options.statfile)
    print("Unknowns / OOV:", unknowns, "=",
          unknowns / tokens * 100 if tokens != 0 else 0, "%",
          file=options.statfile)
    print("CPU time:", cpuend - cpustart, "Real time:", realend - realstart,
          file=options.statfile)
    print("Tokens per timeunit:", tokens / (realend - realstart),
          file=options.statfile)
    exit(0)
def main():
    """Run a coverage / ftb3.1 regression test over a frequency word list.

    Input lines are ``freq<TAB>surface[<TAB>lemma<TAB>analysis]``; each
    surface form is analysed with omorfi and the results are tallied as
    coverage (and, in ftb3.1 mode, lemma/analysis match) statistics.
    Exits non-zero when coverage or match rate falls at or below the
    threshold, so it can serve as a ``make check`` target.
    """
    a = ArgumentParser()
    a.add_argument('-a', '--analyser', metavar='FSAFILE', required=True,
                   help="load analyser from FSAFILE")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-o', '--output', metavar="OUTFILE", type=FileType('w'),
                   dest="outfile", help="log outputs to OUTFILE")
    a.add_argument('-X', '--statistics', metavar="STATFILE",
                   type=FileType('w'), dest="statfile", help="statistics")
    a.add_argument('-v', '--verbose', action="store_true", default=False,
                   help="Print verbosely while processing")
    a.add_argument('-C', '--no-casing', action="store_true", default=False,
                   help="Do not try to recase input and output when matching")
    a.add_argument('-f', '--format', metavar="FORMAT",
                   help="use FORMAT formatter to compare analyses",
                   choices=["coverage", "ftb3.1"], default="coverage")
    a.add_argument('-c', '--count', metavar="FREQ", default=0,
                   help="test only word-forms with frequency higher than FREQ")
    a.add_argument('-t', '--threshold', metavar="THOLD", default=99,
                   help="if coverage is less than THOLD exit with error")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    try:
        if options.analyser:
            if options.verbose:
                print("reading analyser from", options.analyser)
            omorfi.load_analyser(options.analyser)
        if not options.infile:
            options.infile = stdin
            print("reading from <stdin>")
        if not options.statfile:
            options.statfile = stdout
        if not options.outfile:
            options.outfile = stdout
    except IOError:
        print("Could not process file", options.analyser, file=stderr)
        exit(2)
    # basic statistics — all counters are weighted by corpus frequency,
    # not by distinct word forms
    covered = 0
    full_matches = 0
    lemma_matches = 0
    anal_matches = 0
    no_matches = 0
    no_results = 0
    lines = 0
    # for make check target
    threshold = options.threshold
    realstart = perf_counter()
    cpustart = process_time()
    for line in options.infile:
        # accept either space- or tab-separated freq/surface columns
        fields = line.strip().replace(' ', '\t', 1).split('\t')
        if len(fields) < 2:
            print("ERROR: Skipping line", fields, file=stderr)
            continue
        freq = int(fields[0])
        # the list is frequency-sorted, so the first rare word ends the run
        if freq < int(options.count):
            break
        surf = fields[1]
        lemma = surf
        analysis = surf
        if options.format != 'coverage':
            # ftb3.1 gold data carries reference lemma and analysis columns
            lemma = fields[2]
            analysis = fields[3]
        lines += freq
        if options.verbose:
            print(lines, '(', freq, ') ...', end='\r')
        anals = omorfi.analyse(surf)
        if not is_tokenlist_oov(anals):
            covered += freq
        else:
            no_results += freq
            print("OOV", surf, sep='\t', file=options.outfile)
        found_anals = False
        found_lemma = False
        for anal in anals:
            if options.format == 'ftb3.1':
                anal_ftb3 = format_feats_ftb(anal)
                lemma_ftb3 = '#'.join(get_lemmas(anal))
                # hacks ftb3:
                analysis = analysis.replace(" >>>", "")
                if analysis == anal_ftb3:
                    found_anals = True
                    print("ANALHIT", analysis, anal_ftb3,
                          file=options.outfile)
                elif set(anal_ftb3.split()) == set(analysis.split()):
                    # same tags in a different order still counts as a hit
                    found_anals = True
                    print("PERMUTAHIT", analysis, anal_ftb3,
                          file=options.outfile)
                else:
                    print("ANALMISS", analysis, anal_ftb3,
                          file=options.outfile)
                if lemma == lemma_ftb3:
                    found_lemma = True
                    print("LEMMAHIT", lemma, lemma_ftb3,
                          file=options.outfile)
                elif lemma.replace('#', '') == lemma_ftb3.replace('#', ''):
                    # differs only in compound-boundary markers
                    found_lemma = True
                    print("LEMMARECOMP", lemma, lemma_ftb3,
                          file=options.outfile)
                else:
                    print("LEMMAMISS", lemma, lemma_ftb3,
                          file=options.outfile)
        if options.format != 'coverage':
            # classify the word form by which parts matched the gold data
            if not found_anals and not found_lemma:
                no_matches += freq
                print("NOHITS!", surf, sep='\t', file=options.outfile)
            elif found_anals and found_lemma:
                print("HIT", surf, sep='\t', file=options.outfile)
                full_matches += freq
            elif not found_anals:
                anal_matches += freq
                print("LEMMANOANAL", surf, sep='\t', file=options.outfile)
            elif not found_lemma:
                lemma_matches += freq
                print("ANALNOLEMMA", surf, sep='\t', file=options.outfile)
            else:
                # the four branches above are exhaustive
                print("Logical error, kill everyone")
                exit(13)
    realend = perf_counter()
    cpuend = process_time()
    print("CPU time:", cpuend - cpustart, "real time:", realend - realstart)
    print("Lines", "Covered", "OOV", sep="\t", file=options.statfile)
    print(lines, covered, lines - covered, sep="\t", file=options.statfile)
    print(lines / lines * 100 if lines != 0 else 0,
          covered / lines * 100 if lines != 0 else 0,
          (lines - covered) / lines * 100 if lines != 0 else 0,
          sep="\t", file=options.statfile)
    if options.format == 'ftb3.1':
        print("Lines", "Matches", "Lemma", "Anals", "Mismatch",
              "No results", sep="\t", file=options.statfile)
        print(lines, full_matches, lemma_matches, anal_matches, no_matches,
              no_results, sep="\t", file=options.statfile)
        print(lines / lines * 100 if lines != 0 else 0,
              full_matches / lines * 100 if lines != 0 else 0,
              lemma_matches / lines * 100 if lines != 0 else 0,
              anal_matches / lines * 100 if lines != 0 else 0,
              no_matches / lines * 100 if lines != 0 else 0,
              no_results / lines * 100 if lines != 0 else 0,
              sep="\t", file=options.statfile)
    if lines == 0:
        print("Needs more than 0 lines to determine something", file=stderr)
        exit(2)
    elif options.format == 'ftb3.1' and \
            (full_matches / lines * 100 <= int(options.threshold)):
        print("needs to have", threshold, "% matches to pass regress test\n",
              "please examine", options.outfile.name, "for regressions",
              file=stderr)
        exit(1)
    elif options.format == 'coverage' and \
            (covered / lines * 100 <= int(options.threshold)):
        print("needs to have", threshold, "% coverage to pass regress test\n",
              "please examine", options.outfile.name, "for regressions",
              file=stderr)
        exit(1)
    else:
        exit(0)