def main(): """Command-line interface for omorfi's sort | uniq -c tester.""" a = ArgumentParser() a.add_argument('-a', '--analyser', metavar='FSAFILE', required=True, help="load analyser from FSAFILE") a.add_argument('-i', '--input', metavar="INFILE", type=open, dest="infile", help="source of analysis data") a.add_argument('-o', '--output', metavar="OUTFILE", type=FileType('w'), dest="outfile", help="log outputs to OUTFILE") a.add_argument('-X', '--statistics', metavar="STATFILE", type=FileType('w'), dest="statfile", help="statistics") a.add_argument('-v', '--verbose', action="store_true", default=False, help="Print verbosely while processing") a.add_argument('-C', '--no-casing', action="store_true", default=False, help="Do not try to recase input and output when matching") a.add_argument('-f', '--format', metavar="FORMAT", help="use FORMAT formatter to compare analyses", choices=["coverage", "ftb3.1"], default="coverage") a.add_argument('-c', '--count', metavar="FREQ", default=0, help="test only word-forms with frequency higher than FREQ") a.add_argument('-t', '--threshold', metavar="THOLD", default=99, help="if coverage is less than THOLD exit with error") options = a.parse_args() omorfi = Omorfi(options.verbose) try: if options.analyser: if options.verbose: print("reading analyser from", options.analyser) omorfi.load_analyser(options.analyser) if not options.infile: options.infile = stdin print("reading from <stdin>") if not options.statfile: options.statfile = stdout if not options.outfile: options.outfile = stdout except IOError: print("Could not process file", options.analyser, file=stderr) exit(2) # basic statistics covered = 0 full_matches = 0 lemma_matches = 0 anal_matches = 0 only_permuted = 0 only_rehashed = 0 no_matches = 0 no_results = 0 lines = 0 # types types_covered = 0 types_no_results = 0 types = 0 # for make check target threshold = options.threshold realstart = perf_counter() cpustart = process_time() for line in options.infile: fields = line.strip().replace(' ', '\t', 1).split('\t') if len(fields) < 2: print("ERROR: Skipping line", fields, file=stderr) continue freq = int(fields[0]) if freq < int(options.count): break surf = fields[1] lemma = surf analysis = surf if options.format != 'coverage': lemma = fields[2] analysis = fields[3] lines += freq types += 1 if options.verbose: print(lines, '(', freq, ') ...', end='\r') token = Token(surf) # pos 1 triggers acceptable detitlecasing token.pos = 1 omorfi.analyse(token) if token.is_oov(): omorfi.guess(token) if not token.is_oov(): covered += freq types_covered += 1 else: no_results += freq types_no_results += 1 print(freq, "OOV", surf, sep='\t', file=options.outfile) found_anals = False found_lemma = False rehashed = True permuted = True for anal in token.analyses: if options.format == 'ftb3.1': anal_ftb3 = ' '.join(anal.get_ftb_feats()) lemma_ftb3 = '#'.join(anal.get_lemmas()) # hacks ftb3: analysis = analysis.replace(" >>>", "") if analysis == anal_ftb3: found_anals = True permuted = False elif set(anal_ftb3.split()) == set(analysis.split()): found_anals = True print(freq, "PERMUTAHIT", analysis, anal_ftb3, sep='\t', file=options.outfile) else: print(freq, "ANALMISS", analysis, anal_ftb3, sep='\t', file=options.outfile) if lemma == lemma_ftb3: found_lemma = True rehashed = False elif lemma.replace('#', '') == lemma_ftb3.replace('#', ''): found_lemma = True print(freq, "LEMMARECOMP", lemma, lemma_ftb3, sep='\t', file=options.outfile) else: print(freq, "LEMMAMISS", lemma, lemma_ftb3, sep='\t', file=options.outfile) if 
options.format != 'coverage': if not found_anals and not found_lemma: no_matches += freq print(freq, "NOHITS!", surf, sep='\t', file=options.outfile) elif found_anals and found_lemma: full_matches += freq elif not found_anals: anal_matches += freq print(freq, "LEMMANOANAL", surf, sep='\t', file=options.outfile) elif not found_lemma: lemma_matches += freq print(freq, "ANALNOLEMMA", surf, sep='\t', file=options.outfile) else: print("Logical error, kill everyone") exit(13) if rehashed: only_rehashed += freq if permuted: only_permuted += freq realend = perf_counter() cpuend = process_time() print("CPU time:", cpuend - cpustart, "real time:", realend - realstart) print("Lines", "Covered", "OOV", sep="\t", file=options.statfile) print(lines, covered, lines - covered, sep="\t", file=options.statfile) print(lines / lines * 100 if lines != 0 else 0, covered / lines * 100 if lines != 0 else 0, (lines - covered) / lines * 100 if lines != 0 else 0, sep="\t", file=options.statfile) print("Types", "Covered", "OOV", sep="\t", file=options.statfile) print(types, types_covered, types - types_covered, sep="\t", file=options.statfile) print(types / types * 100 if types != 0 else 0, types_covered / types * 100 if types != 0 else 0, (types - types_covered) / types * 100 if types != 0 else 0, sep="\t", file=options.statfile) if options.format == 'ftb3.1': print("Lines", "Matches", "Lemma", "Anals", "Mismatch", "No results", sep="\t", file=options.statfile) print(lines, full_matches, lemma_matches, anal_matches, no_matches, no_results, sep="\t", file=options.statfile) print(lines / lines * 100 if lines != 0 else 0, full_matches / lines * 100 if lines != 0 else 0, lemma_matches / lines * 100 if lines != 0 else 0, anal_matches / lines * 100 if lines != 0 else 0, no_matches / lines * 100 if lines != 0 else 0, no_results / lines * 100 if lines != 0 else 0, sep="\t", file=options.statfile) print("Of which", "Tag permuations", "Lemma rehashing", sep='\t', file=options.statfile) print(lines / lines * 100 if lines != 0 else 0, only_permuted / lines * 100 if lines != 0 else 0, only_rehashed / lines * 100 if lines != 0 else 0, sep='\t', file=options.statfile) if lines == 0: print("Needs more than 0 lines to determine something", file=stderr) exit(2) elif options.format == 'ftb3.1' and \ (full_matches / lines * 100 <= int(options.threshold)): print("needs to have", threshold, "% matches to pass regress test\n", "please examine", options.outfile.name, "for regressions", file=stderr) exit(1) elif options.format == 'coverage' and \ (covered / lines * 100 <= int(options.threshold)): print("needs to have", threshold, "% coverage to pass regress test\n", "please examine", options.outfile.name, "for regressions", file=stderr) exit(1) else: exit(0)
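
# A minimal standalone sketch (not part of the tester above) of the input
# shape the tester expects: `sort | uniq -c` output, where the leading count
# is separated from the word-form by spaces and the parser normalises the
# first space into a tab before splitting. The data line is fabricated.
def _demo_parse_uniq_c_line():
    line = "  1234 talossa"  # typical `uniq -c` output line (hypothetical)
    fields = line.strip().replace(' ', '\t', 1).split('\t')
    freq = int(fields[0])
    surf = fields[1]
    assert (freq, surf) == (1234, 'talossa')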
def main(): """Command-line interface for omorfi's sort | uniq -c tester.""" a = ArgumentParser() a.add_argument('-g', '--generator', metavar='FSAFILE', required=True, help="load generator from FSAFILE") a.add_argument('-w', '--word', metavar="WORD_ID", required=True, help="generate forms of word WORD_ID") a.add_argument('-o', '--output', metavar="OUTFILE", type=FileType('w'), dest="outfile", help="log outputs to OUTFILE") a.add_argument('-X', '--statistics', metavar="STATFILE", type=FileType('w'), dest="statfile", help="statistics") a.add_argument('-v', '--verbose', action="store_true", default=False, help="Print verbosely while processing") a.add_argument('-O', '--output-format', metavar="OFORMAT", default="markdown", help="Create output table in OFORMAT") a.add_argument('-u', '--upos', metavar="UPOS", required=True, choices=["ADJ", "NOUN", "VERB", "NUM", "X"], help="generate inflection table for UPOS") options = a.parse_args() omorfi = Omorfi(options.verbose) try: if options.generator: if options.verbose: print("reading generator from", options.generator) omorfi.load_generator(options.generator) if not options.statfile: options.statfile = stdout if not options.outfile: options.outfile = stdout except IOError: print("Could not process file", options.generator, file=stderr) exit(2) # for make check target realstart = perf_counter() cpustart = process_time() print("### Inflection of", options.word, file=options.outfile) print(file=options.outfile) if options.upos == 'NOUN': print_nominals(omorfi, options.word, options.upos, options.outfile) elif options.upos == 'ADJ': print_comparatives(omorfi, options.word, options.upos, 'POS', options.outfile) # comparisons print(file=options.outfile) print_comparatives(omorfi, options.word, options.upos, "CMP", options.outfile) print(file=options.outfile) print_comparatives(omorfi, options.word, options.upos, "SUP", options.outfile) elif options.upos == 'NUM': print_numerals(omorfi, options.word, options.upos, options.outfile) elif options.upos == 'VERB': print_finites(omorfi, options.word, options.upos, options.outfile) print(file=options.outfile) print_infinites(omorfi, options.word, options.upos, options.outfile) print(file=options.outfile) print("_Note:_ the inflection tables cover small percentage of the " + "whole inflectional paradigm, for full list, see [" + options.word + " full form list](" + options.word + ".html)", file=options.outfile) print(file=options.outfile) realend = perf_counter() cpuend = process_time() print("CPU time:", cpuend - cpustart, "real time:", realend - realstart) exit(0)
def main(): """Invoke a simple CLI analyser.""" a = ArgumentParser() a.add_argument('-a', '--analyser', metavar='AFILE', help="read analyser model from AFILE", required=True) a.add_argument('-i', '--input', metavar="INFILE", type=open, dest="infile", help="source of analysis data") a.add_argument('-v', '--verbose', action='store_true', help="print verbosely while processing") a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile", help="print output into OUTFILE", type=FileType('w')) a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile", help="print statistics to STATFILE", type=FileType('w')) a.add_argument('-O', '--oracle', action='store_true', help="match to values in input when parsing if possible") a.add_argument('-X', '--frequencies', metavar="FREQDIR", help="read frequencies from FREQDIR/*.freqs") a.add_argument('--debug', action='store_true', help="print lots of debug info while processing") options = a.parse_args() if options.verbose: print("Printing verbosely") omorfi = Omorfi(options.verbose) if options.analyser: if options.verbose: print("reading analyser model", options.analyser) omorfi.load_analyser(options.analyser) else: print("analyser is needed to ftb3", file=stderr) exit(4) if not options.infile: print("reading from <stdin>") options.infile = stdin if options.verbose: print("analysing", options.infile.name) if not options.outfile: options.outfile = stdout if options.verbose: print("writing to", options.outfile.name) if not options.statfile: options.statfile = stdout if options.frequencies: with open(options.frequencies + '/lexemes.freqs') as lexfile: omorfi.load_lexical_frequencies(lexfile) with open(options.frequencies + '/omors.freqs') as omorfile: omorfi.load_omortag_frequencies(omorfile) # statistics realstart = perf_counter() cpustart = process_time() tokens = 0 unknowns = 0 sentences = 0 for line in options.infile: fields = line.strip().split('\t') if len(fields) == 10: # ftb is 10 field format tokens += 1 try: index = int(fields[0]) except ValueError: print("Cannot figure out token index", fields[0], file=stderr) exit(1) token = Token(fields[1]) token.pos = int(fields[0]) omorfi.analyse(token) if token.is_oov(): unknowns += 1 omorfi.guess(token) if options.oracle: try_analyses_ftb(fields, index, token, options.outfile) else: print_analyses_ftb(index, token, options.outfile) elif line.startswith('<') and line.rstrip().endswith('>'): print(line.strip(), file=options.outfile) elif not line or line.strip() == '': # retain exactly 1 empty line between sents print(file=options.outfile) sentences += 1 else: print("Error in ftb3 format: '", line, "'", file=stderr) exit(1) cpuend = process_time() realend = perf_counter() print("Tokens:", tokens, "Sentences:", sentences, file=options.statfile) print("Unknowns / OOV:", unknowns, "=", unknowns / tokens * 100 if tokens != 0 else 0, "%", file=options.statfile) print("CPU time:", cpuend - cpustart, "Real time:", realend - realstart, file=options.statfile) print("Tokens per timeunit:", tokens / (realend - realstart), file=options.statfile) exit(0)


# module-level state used by the corpus statistics script below; assumed to be
# defined alongside its helpers (gather_lemmas, add_to_sent, stat_word_ids,
# print_proper_stats, print_lemma_stats, ...)
total_token_count = 0
sent = []


def main():
    global total_token_count, sent
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAFILE', required=True,
                   help="HFST's optimised lookup binary data for the "
                        "transducer to be applied")
    a.add_argument('-i', '--input', metavar="INFILE", type=str, required=True,
                   dest="infile", help="source of analysis data")
    a.add_argument('-m', '--master', metavar="TSVFILE", type=str, required=True,
                   dest="tsvfile", help="source of existing lexical data")
    opts = a.parse_args()
    if opts.infile:
        test_corpora_files = glob(opts.infile)
    else:
        test_corpora_files = glob("*.text")
    # hard-coded logs for now
    # lemma_log = open('missing_word_ids.log', 'w')
    # case_log = open('missing_nominal_cases.log', 'w')
    # comp_log = open('missing_comparatives.log', 'w')
    # adposition_log = open('adposition_complements.log', 'w')
    # adposition_stats = open('adposition_complements_full.log', 'w')
    # adjective_log = open('adjective_agreements.log', 'w')
    proper_stats = open('proper_contexts_full.log', 'w')
    # open('../src/probabilistics/lemmas.freqs', 'w')
    lemma_stats = open('lemmas.freqs', 'w')
    # case_stats = open('../src/probabilistics/cases.freqs', 'w')
    omorfi = Omorfi()
    omorfi.load_filename(opts.fsa)
    gather_lemmas(open(opts.tsvfile))
    test_corpora = list()
    for test_corpus_file in test_corpora_files:
        try:
            test_corpora.append(open(test_corpus_file))
        except IOError as ioe:
            print("Failed to open corpus ", test_corpus_file, ":", ioe)
    for test_corpus in test_corpora:
        print('lines from', test_corpus.name)
        linen = 0
        for line in test_corpus:
            linen += 1
            if (linen % 500000) == 0:
                print(linen, "...! Time to reload everything because memory "
                             "is leaking very badly indeed!")
                sent = list()
                omorfi = None
                omorfi = Omorfi()
                omorfi.load_filename(opts.fsa)
                gc.collect()
            if (linen % 1000) == 0:
                print(linen, "...", end='\r')
            for punct in ".,:;?!()":
                line = line.replace(punct, " " + punct + " ")
            for token in line.split():
                total_token_count += 1
                analyses = omorfi.analyse(token)
                add_to_sent(analyses, token)
                stat_word_ids(token, analyses)
                # stat_nominal_cases(token, analyses, case_log)
                # stat_adjective_comps(token, analyses, comp_log)
    print("Testing statistics")
    # test_zero_lemmas(lemma_log)
    # test_zero_cases(case_log)
    # test_zero_comps(comp_log)
    # test_case_deviations()
    # test_adposition_complements(adposition_log)
    # test_adjective_agreements(adjective_log)
    print("Writing accurate statistics")
    # print_adposition_stats(adposition_stats)
    print_proper_stats(proper_stats)
    print_lemma_stats(lemma_stats)
    # print_case_stats(case_stats)
    exit(0)
def main(): """Command-line interface for omorfi's sort | uniq -c tester.""" a = ArgumentParser() a.add_argument('-a', '--analyser', metavar='FSAFILE', required=True, help="load analyser from FSAFILE") a.add_argument('-i', '--input', metavar="INFILE", type=open, dest="infile", help="source of analysis data") a.add_argument('-o', '--output', metavar="OUTFILE", type=FileType('w'), dest="outfile", help="log outputs to OUTFILE") a.add_argument('-X', '--statistics', metavar="STATFILE", type=FileType('w'), dest="statfile", help="statistics") a.add_argument('-v', '--verbose', action="store_true", default=False, help="Print verbosely while processing") a.add_argument('-C', '--no-casing', action="store_true", default=False, help="Do not try to recase input and output when matching") a.add_argument('-t', '--threshold', metavar="THOLD", default=99, help="if coverage is less than THOLD exit with error") options = a.parse_args() omorfi = Omorfi(options.verbose) try: if options.analyser: if options.verbose: print("reading analyser from", options.analyser) omorfi.load_analyser(options.analyser) if not options.infile: options.infile = stdin print("reading from <stdin>") if not options.statfile: options.statfile = stdout if not options.outfile: options.outfile = stdout except IOError: print("Could not process file", options.analyser, file=stderr) exit(2) # basic statistics covered = 0 full_matches = 0 lemma_matches = 0 anal_matches = 0 no_matches = 0 no_results = 0 only_permuted = 0 accfails = 0 lines = 0 # for make check target threshold = options.threshold realstart = perf_counter() cpustart = process_time() for line in options.infile: fields = line.strip().split('\t') if len(fields) < 3: print("ERROR: Skipping line", fields, file=stderr) continue if ' ' in fields[1] or ' ' in fields[0]: continue lines += 1 if options.verbose and lines % 1000 == 0: print(lines, '...') lemma = fields[0] surf = fields[1] unimorph = fields[2].replace('ACC', 'NOM').replace('GEADJ', 'GEN') token = Token(surf) omorfi.analyse(token) if not token.is_oov(): covered += 1 else: no_results += 1 print(1, "OOV", surf, sep='\t', file=options.outfile) found_anals = False found_lemma = False permuted = True accfail = False for anal in token.analyses: analhyp = anal.printable_unimorph() lemmahyp = ''.join(anal.get_lemmas()) if analhyp == unimorph: found_anals = True permuted = False elif set(analhyp.split(';')) == set(unimorph.split(';')): found_anals = True # print("PERMUTAHIT", analhyp, unimorph, sep='\t', # file=options.outfile) else: pass # print("ANALMISS", analhyp, unimorph, sep='\t', # file=options.outfile) if lemma == lemmahyp: found_lemma = True else: pass # print("LEMMAMISS", lemmahyp, lemma, sep='\t', # file=options.outfile) if not found_anals and not found_lemma: no_matches += 1 print("NOHITS!", surf, lemma, unimorph, [a.printable_unimorph() for a in token.analyses], sep='\t', file=options.outfile) elif found_anals and found_lemma: full_matches += 1 elif not found_anals: anal_matches += 1 print("LEMMANOANAL", surf, unimorph, [a.printable_unimorph() for a in token.analyses], sep='\t', file=options.outfile) elif not found_lemma: lemma_matches += 1 print("ANALNOLEMMA", surf, lemma, [a.get_lemmas() for a in token.analyses], sep='\t', file=options.outfile) else: print("Logical error, kill everyone") exit(13) if permuted: only_permuted += 1 if accfail: accfails += 1 realend = perf_counter() cpuend = process_time() print("CPU time:", cpuend - cpustart, "real time:", realend - realstart) if lines == 0: print("Needs more than 0 lines 
to determine something", file=stderr) exit(2) print("Lines", "Covered", "OOV", sep="\t", file=options.statfile) print(lines, covered, lines - covered, sep="\t", file=options.statfile) print(lines / lines * 100 if lines != 0 else 0, covered / lines * 100 if lines != 0 else 0, (lines - covered) / lines * 100 if lines != 0 else 0, sep="\t", file=options.statfile) print("Lines", "Matches", "Lemma", "Anals", "Mismatch", "No results", sep="\t", file=options.statfile) print(lines, full_matches, lemma_matches, anal_matches, no_matches, no_results, sep="\t", file=options.statfile) print(lines / lines * 100 if lines != 0 else 0, full_matches / lines * 100 if lines != 0 else 0, lemma_matches / lines * 100 if lines != 0 else 0, anal_matches / lines * 100 if lines != 0 else 0, no_matches / lines * 100 if lines != 0 else 0, no_results / lines * 100 if lines != 0 else 0, sep="% \t", file=options.statfile) print("Of which", "Tag permuations", sep='\t', file=options.statfile) print(lines / lines * 100 if lines != 0 else 0, only_permuted / lines * 100 if lines != 0 else 0, sep='\t', file=options.statfile) if full_matches / lines * 100 <= int(options.threshold): print("needs to have", threshold, "% matches to pass regress test\n", "please examine", options.outfile.name, "for regressions", file=stderr) exit(1) elif covered / lines * 100 <= int(options.threshold): print("needs to have", threshold, "% coverage to pass regress test\n", "please examine", options.outfile.name, "for regressions", file=stderr) exit(1) else: exit(0)
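
# The order-insensitive comparison above treats a UniMorph tag string as a
# set of semicolon-separated tags; a standalone sketch of that check with
# fabricated tag strings:
def _demo_permutation_match():
    hyp = 'N;IN+ESS;SG'
    gold = 'N;SG;IN+ESS'
    assert hyp != gold                                  # exact match fails
    assert set(hyp.split(';')) == set(gold.split(';'))  # permutation match hits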
def main(): print("""Please note that the licence of FTC does not allow you to do much with the results or anything, other than including approximate numbers of recall for scientific purposes. Please do not look at the differences or do any processing with any of the data since it will automatically make your versions of all your future work on this or any other analysers of the Finnish language illegal and other bad things.""", file=stderr) password = "******" userpass = input("Write '%s': " % (password)) if userpass != password: print( "You have chosen not to use badly licenced FTC data", file=stderr) exit(2) a = ArgumentParser() a.add_argument('-f', '--fsa', metavar='FSADIR', required=True, help="Location of omorfi automata") a.add_argument('-i', '--input', metavar="INFILE", type=open, required=True, dest="infile", help="source of analysis data") a.add_argument('-o', '--output', metavar="OUTFILE", required=True, type=FileType('w'), dest="outfile", help="result file") a.add_argument('-X', '--statistics', metavar="STATFILE", type=FileType('w'), dest="statfile", help="statistics") options = a.parse_args() omorfi = Omorfi() omorfi.load_from_dir(options.fsa) if not options.statfile: options.statfile = stdout # basic statistics full_matches = 0 lemma_matches = 0 anal_matches = 0 no_matches = 0 no_results = 0 lines = 0 # for make check target threshold = 0 for line in options.infile: if '<w lemma' not in line or 'msd=' not in line: continue matches = re.search('<w.*lemma="([^"]*).*msd="([^"]*)".*>([^<]*)</w>', line) if not matches: print("ERROR: Skipping line", line, file=stderr) continue lines += 1 if lines % 100000 == 0: print(lines, "...", file=stderr) ftcsurf = matches.group(3) ftclemma = matches.group(1) ftcanals = matches.group(2) omors = omorfi.analyse(ftcsurf) anals = [] for omor in omors: anals.append(convert_omor_string(omor.output, 'ftc')) found_anals = False found_lemma = False print_in = True for anal in anals: if ftcanals in anal: found_anals = True if ftclemma in anal: found_lemma = True if len(anals) == 0: print_in = False no_results += 1 print("NORESULTS:", ftcsurf, ftclemma, ftcanals, sep="\t", file=options.outfile) elif not found_anals and not found_lemma: no_matches += 1 print("NOMATCH:", ftcsurf, ftclemma, ftcanals, sep="\t", end="\t", file=options.outfile) elif not found_anals: lemma_matches += 1 print("NOANALMATCH:", ftcsurf, ftcanals, sep="\t", end="\t", file=options.outfile) elif not found_lemma: anal_matches += 1 print("NOLEMMAMATCH:", ftcsurf, ftclemma, sep="\t", end="\t", file=options.outfile) else: full_matches += 1 print_in = False if print_in: print(":IN:", end="\t", file=options.outfile) for anal in anals: print(anal, end='\t', file=options.outfile) print(file=options.outfile) print("Lines", "Matches", "Lemma", "Anals", "Mismatch", "No results", sep="\t", file=options.statfile) print(lines, full_matches, lemma_matches, anal_matches, no_matches, no_results, sep="\t", file=options.statfile) print(lines / lines * 100, full_matches / lines * 100, lemma_matches / lines * 100, anal_matches / lines * 100, no_matches / lines * 100, no_results / lines * 100, sep="\t", file=options.statfile) if (full_matches / lines * 100 < threshold): print("needs to have", threshold, "% matches to pass regress test\n", "please examine", options.outfile.name, "for regressions", file=stderr) exit(1) else: exit(0)
def main(): """Invoke a simple CLI analyser.""" a = ArgumentParser() a.add_argument('-a', '--analyser', metavar='AFILE', required=True, help="read analyser model from AFILE") a.add_argument('-i', '--input', metavar="INFILE", type=open, dest="infile", help="source of analysis data") a.add_argument('-v', '--verbose', action='store_true', help="print verbosely while processing") a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile", help="print output into OUTFILE", type=FileType('w')) a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile", help="print statistics to STATFILE", type=FileType('w')) a.add_argument('-O', '--oracle', action='store_true', help="match to values in input when parsing if possible") a.add_argument('-u', '--udpipe', metavar="UDPIPE", help='use UDPIPE for additional guesses (experi-mental)') a.add_argument('--hacks', metavar='HACKS', help="mangle analyses to match HACKS version of UD", choices=['ftb']) a.add_argument('-X', '--frequencies', metavar="FREQDIR", help="read frequencies from FREQDIR/*.freqs") a.add_argument('--not-rules', metavar="RULEFILE", type=open, required=True, help="read non-rules from RULEFILE") a.add_argument('--debug', action='store_true', help="print lots of debug info while processing") options = a.parse_args() if options.verbose: print("Printing verbosely") omorfi = Omorfi(options.verbose) if options.analyser: if options.verbose: print("reading analyser model", options.analyser) omorfi.load_analyser(options.analyser) else: print("analyser is needed to conllu", file=stderr) exit(4) disamparsulator = Disamparsulator() if options.not_rules: if options.verbose: print("Loading", options.not_rules) disamparsulator.frobblesnizz(options.not_rules) if options.udpipe: if options.verbose: print("Loading udpipe", options.udpipe) omorfi.load_udpipe(options.udpipe) if not options.infile: print("reading from <stdin>") options.infile = stdin if options.verbose: print("analysing", options.infile.name) if not options.outfile: options.outfile = stdout if options.verbose: print("writing to", options.outfile.name) if not options.statfile: options.statfile = stdout if options.frequencies: with open(options.frequencies + '/lexemes.freqs') as lexfile: omorfi.load_lexical_frequencies(lexfile) with open(options.frequencies + '/omors.freqs') as omorfile: omorfi.load_omortag_frequencies(omorfile) # statistics realstart = perf_counter() cpustart = process_time() tokens = 0 unknowns = 0 sentences = 0 eoffed = False while not eoffed: sentplus = next_conllu(options.infile) if not sentplus: eoffed = True break for token in sentplus: if token.nontoken: if token.nontoken == 'comment': pass elif token.nontoken == 'eof': eoffed = True break elif token.nontoken == 'separator': sentences += 1 elif token.nontoken == 'error': print("Unrecognisable line:", token.error, file=stderr) exit(1) else: print("Error:", token, file=stderr) exit(1) continue elif not token.surf: print("No surface in CONLL-U?", token, file=stderr) exit(1) tokens += 1 omorfi.analyse(token) if token.is_oov(): unknowns += 1 omorfi.guess(token) disamparsulator.linguisticate(sentplus) print_analyses(sentplus, options) cpuend = process_time() realend = perf_counter() print("Tokens:", tokens, "Sentences:", sentences, file=options.statfile) print("Unknowns / OOV:", unknowns, "=", unknowns / tokens * 100 if tokens != 0 else 0, "%", file=options.statfile) print("CPU time:", cpuend - cpustart, "Real time:", realend - realstart, file=options.statfile) print("Tokens per timeunit:", tokens / (realend 
- realstart), file=options.statfile) print("Sentences per timeunit:", sentences / (realend - realstart), file=options.statfile) exit(0)
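
# The read loop above pulls one sentence batch at a time from next_conllu; a
# generator wrapper (a sketch, assuming next_conllu returns a falsy value at
# end of file) would let the same loop read as a plain for-statement:
def conllu_sentences(infile):
    while True:
        sentplus = next_conllu(infile)
        if not sentplus:
            return
        yield sentplus
# usage: for sentplus in conllu_sentences(options.infile): ...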
def main(): """Invoke a simple CLI analyser.""" a = ArgumentParser() a.add_argument('-a', '--analyser', metavar='AFILE', help="load analyser model from AFILE") a.add_argument('-i', '--input', metavar="INFILE", type=open, dest="infile", help="source of analysis data") a.add_argument('-v', '--verbose', action='store_true', help="print verbosely while processing") a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile", help="print output into OUTFILE", type=FileType('w')) a.add_argument('-F', '--format', metavar="INFORMAT", default='text', help="read input using INFORMAT tokenisation", choices=['text', 'vislcg', 'conllu', 'sentences']) a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile", help="print statistics to STATFILE", type=FileType('w')) a.add_argument('--not-rules', metavar="RULEFILE", type=open, help="read non-rules from RULEFILE") options = a.parse_args() omorfi = Omorfi(options.verbose) if options.analyser: if options.verbose: print("reading analyser model", options.analyser) omorfi.load_analyser(options.analyser) else: print("analyser is required to vislcg", file=stderr) exit(4) disamparsulator = None if options.not_rules: if options.verbose: print("Reading rulestuff", options.not_rules.name) disamparsulator = Disamparsulator() disamparsulator.frobblesnizz(options.not_rules) if not options.infile: options.infile = stdin if options.verbose: print("analysing", options.infile.name) if not options.outfile: options.outfile = stdout if options.verbose: print("writing to", options.outfile.name) if not options.statfile: if options.outfile == stdout: options.statfile = stdout else: options.statfile = stderr # statistics realstart = perf_counter() cpustart = process_time() tokencount = 0 unknowns = 0 eoffed = False while not eoffed: if options.format == 'vislcg': tokens = next_vislcg(options.infile) elif options.format == 'text': tokens = next_plaintext(options.infile) elif options.format == 'conllu': tokens = next_conllu(options.infile) else: print("input format missing implementation", options.format, file=stderr) exit(2) if not tokens: break for token in tokens: if token.surf: tokencount += 1 omorfi.analyse(token) if token.is_oov(): unknowns += 1 omorfi.guess(token) elif token.error or token.nontoken: pass else: print("Unrecognised", token, file=stderr) exit(2) if disamparsulator: disamparsulator.linguisticate(tokens) for token in tokens: if token.nontoken and token.nontoken == "eof": eoffed = True break print(token.printable_vislcg(), file=options.outfile) cpuend = process_time() realend = perf_counter() print("# Tokens:", tokencount, "\n# Unknown:", unknowns, unknowns / tokencount * 100 if tokencount > 0 else 0, "%", file=options.statfile) print("# CPU time:", cpuend - cpustart, "\n# Real time:", realend - realstart, file=options.statfile) print("# Tokens per timeunit:", tokencount / (realend - realstart), file=options.statfile) exit(0)
def main(): """Segment text in some formats.""" a = ArgumentParser() a.add_argument('-s', '--segmenter', metavar='SFILE', help="load segmenter from SFILE", required=True) a.add_argument('-S', '--labeller', metavar='LSFILE', help="load labelsegmenter from LSFILE", required=True) a.add_argument('-i', '--input', metavar="INFILE", type=open, dest="infile", help="source of analysis data") a.add_argument('-v', '--verbose', action='store_true', help="print verbosely while processing") a.add_argument('-o', '--output', metavar="OUTFILE", help="print segments into OUTFILE") a.add_argument('-O', '--output-format', metavar="OFORMAT", help="format output suitable for OFORMAT", required=True, choices=["moses-factors", "segments"]) a.add_argument('--no-split-words', action="store_false", default=True, dest="split_words", help="split on word boundaries") a.add_argument('--no-split-new-words', action="store_false", default=True, dest="split_new_words", help="split on new word boundaries " + "(prev. unattested compounds)") a.add_argument('--no-split-morphs', action="store_false", default=True, dest="split_morphs", help="split on morph boundaries") a.add_argument('--split-derivs', action="store_true", default=False, help="split on derivation boundaries") a.add_argument('--split-nonwords', action="store_true", default=False, help="split on other boundaries") a.add_argument('--segment-marker', default='→ ←', metavar='SEG', help="mark segment boundaries with SEG") a.add_argument('--show-ambiguous', default=False, metavar='ASEP', help="separate ambiguous segmentations with SEG") options = a.parse_args() omorfi = Omorfi(options.verbose) if options.segmenter: if options.verbose: print("Reading segmenter", options.segmenter) omorfi.load_segmenter(options.segmenter) else: print("segmenter is needed for segmenting", file=stderr) exit(2) if options.labeller: if options.verbose: print("Reading labelsegmenter", options.labeller) omorfi.load_labelsegmenter(options.labeller) if not omorfi.can_segment or not omorfi.can_labelsegment: print("Could not load segmenter(s), re-compile them or use -f option") print() print("To compile segmenter, use --enable-segmenter, and/or", "--enable-labeled-segments") exit(1) if options.infile: infile = options.infile else: options.infile = stdin infile = stdin if options.output: outfile = open(options.output, 'w') else: options.output = "<stdout>" outfile = stdout if options.segment_marker is None: if options.verbose: print("Default segment marker is → ←") options.segment_marker = '→ ←' if options.verbose: print("reading from", options.infile.name) if options.verbose: print("writign to", options.output) linen = 0 for line in infile: line = line.strip() linen += 1 if options.verbose and linen % 10000 == 0: print(linen, '...') if not line or line == '': print(file=outfile) continue tokens = omorfi.tokenise(line) for token in tokens: omorfi.segment(token) omorfi.labelsegment(token) if options.output_format == 'moses-factors': print_moses_factor_segments(token, outfile, options) elif options.output_format == 'segments': print_segments(token, outfile, options) print(file=outfile) exit(0)
def main(): """Command-line interface for omorfi's sort | uniq -c tester.""" a = ArgumentParser() a.add_argument('-a', '--analyser', metavar='FSAFILE', required=True, help="load analyser from FSAFILE") a.add_argument('-g', '--generator', metavar='FSAFILE', required=True, help="load analyser from FSAFILE") a.add_argument('-i', '--input', metavar="INFILE", type=open, dest="infile", help="source of analysis data") a.add_argument('-o', '--output', metavar="OUTFILE", type=FileType('w'), dest="outfile", help="log outputs to OUTFILE") a.add_argument('-X', '--statistics', metavar="STATFILE", type=FileType('w'), dest="statfile", help="statistics") a.add_argument('-v', '--verbose', action="store_true", default=False, help="Print verbosely while processing") a.add_argument('-C', '--no-casing', action="store_true", default=False, help="Do not try to recase input and output when matching") a.add_argument('-t', '--threshold', metavar="THOLD", default=99, help="if coverage is less than THOLD exit with error") a.add_argument('-F', '--format', metavar="FMT", required=True, help="which SIGMORHON shared task format is used") options = a.parse_args() omorfi = Omorfi(options.verbose) try: if options.analyser: if options.verbose: print("reading analyser from", options.analyser) omorfi.load_analyser(options.analyser) if options.generator: if options.verbose: print("reading generator from", options.generator) omorfi.load_generator(options.generator) if not options.infile: options.infile = stdin print("reading from <stdin>") if not options.statfile: options.statfile = stdout if not options.outfile: options.outfile = stdout except IOError: print("Could not process file", options.analyser, file=stderr) exit(2) # basic statistics correct = 0 incorrect = 0 oov = 0 lines = 0 # for make check target realstart = perf_counter() cpustart = process_time() for line in options.infile: fields = line.strip().split('\t') if len(fields) < 3: print("ERROR: Skipping line", fields, file=stderr) continue omors = None lemma = None print("<<<", fields) if options.format == '1': lemma = fields[0] omors = unimorph2omor(fields[1]) elif options.format == '2': srcomors = unimorph2omor(fields[0]) srchyps = omorfi.analyse(fields[1]) for srchyp in srchyps: if srcomors in srchyp.raw and len(srchyp.get_lemmas()) == 1: lemma = srchyp.get_lemmas()[0] if not lemma: lemma = ''.join(srchyps[0].get_lemmas()) omors = unimorph2omor(fields[2]) elif options.format == '3': srchyps = omorfi.analyse(fields[0]) for srchyp in srchyps: if len(srchyp.get_lemmas()) == 1: lemma = srchyp.get_lemmas()[0] if not lemma: lemma = ''.join(srchyps[0].get_lemmas()) omors = unimorph2omor(fields[1]) else: print("format fail", options.format) exit(1) genomor = '[WORD_ID=' + lemma + ']' + omors print(">>> ", genomor) generations = omorfi.generate(genomor) if not generations or '[' in generations: oov += 1 genat1 = lemma print("OOV", genat1) else: genat1 = generations.split('/')[0] print("@1 ", genat1) if options.format == '1': if genat1 == fields[2]: correct += 1 else: print("MIS", genat1, "!=", fields[2]) incorrect += 1 elif options.format == '2': if genat1 == fields[3]: correct += 1 else: print("MIS", genat1, "!=", fields[2]) incorrect += 1 elif options.format == '3': if genat1 == fields[2]: correct += 1 else: print("MIS", genat1, "!=", fields[2]) incorrect += 1 lines += 1 if options.verbose and lines % 1000 == 0: print(lines, '...') realend = perf_counter() cpuend = process_time() print("CPU time:", cpuend - cpustart, "real time:", realend - realstart) if lines == 0: 
print("Needs more than 0 lines to determine something", file=stderr) exit(2) print("Lines", "Corect", "OOV", sep="\t", file=options.statfile) print(lines, correct, oov, sep="\t", file=options.statfile) print(lines / lines * 100 if lines != 0 else 0, correct / lines * 100 if lines != 0 else 0, oov / lines * 100, sep="\t", file=options.statfile) exit(0)
def main(): print("""Please note that the licence of FTC does not allow you to do much with the results or anything, other than including approximate numbers of recall for scientific purposes. Please do not look at the differences or do any processing with any of the data since it will automatically make your versions of all your future work on this or any other analysers of the Finnish language illegal and other bad things.""", file=stderr) password = "******" userpass = input("Write '%s': " % (password)) if userpass != password: print("You have chosen not to use badly licenced FTC data", file=stderr) exit(2) a = ArgumentParser() a.add_argument('-f', '--fsa', metavar='FSADIR', required=True, help="Location of omorfi automata") a.add_argument('-i', '--input', metavar="INFILE", type=open, required=True, dest="infile", help="source of analysis data") a.add_argument('-o', '--output', metavar="OUTFILE", required=True, type=FileType('w'), dest="outfile", help="result file") a.add_argument('-X', '--statistics', metavar="STATFILE", type=FileType('w'), dest="statfile", help="statistics") options = a.parse_args() omorfi = Omorfi() omorfi.load_from_dir(options.fsa) if not options.statfile: options.statfile = stdout # basic statistics full_matches = 0 lemma_matches = 0 anal_matches = 0 no_matches = 0 no_results = 0 lines = 0 # for make check target threshold = 0 for line in options.infile: if '<w lemma' not in line or 'msd=' not in line: continue matches = re.search('<w.*lemma="([^"]*).*msd="([^"]*)".*>([^<]*)</w>', line) if not matches: print("ERROR: Skipping line", line, file=stderr) continue lines += 1 if lines % 100000 == 0: print(lines, "...", file=stderr) ftcsurf = matches.group(3) ftclemma = matches.group(1) ftcanals = matches.group(2) omors = omorfi.analyse(ftcsurf) anals = [] for omor in omors: anals.append(convert_omor_string(omor.output, 'ftc')) found_anals = False found_lemma = False print_in = True for anal in anals: if ftcanals in anal: found_anals = True if ftclemma in anal: found_lemma = True if len(anals) == 0: print_in = False no_results += 1 print("NORESULTS:", ftcsurf, ftclemma, ftcanals, sep="\t", file=options.outfile) elif not found_anals and not found_lemma: no_matches += 1 print("NOMATCH:", ftcsurf, ftclemma, ftcanals, sep="\t", end="\t", file=options.outfile) elif not found_anals: lemma_matches += 1 print("NOANALMATCH:", ftcsurf, ftcanals, sep="\t", end="\t", file=options.outfile) elif not found_lemma: anal_matches += 1 print("NOLEMMAMATCH:", ftcsurf, ftclemma, sep="\t", end="\t", file=options.outfile) else: full_matches += 1 print_in = False if print_in: print(":IN:", end="\t", file=options.outfile) for anal in anals: print(anal, end='\t', file=options.outfile) print(file=options.outfile) print("Lines", "Matches", "Lemma", "Anals", "Mismatch", "No results", sep="\t", file=options.statfile) print(lines, full_matches, lemma_matches, anal_matches, no_matches, no_results, sep="\t", file=options.statfile) print(lines / lines * 100, full_matches / lines * 100, lemma_matches / lines * 100, anal_matches / lines * 100, no_matches / lines * 100, no_results / lines * 100, sep="\t", file=options.statfile) if (full_matches / lines * 100 < threshold): print("needs to have", threshold, "% matches to pass regress test\n", "please examine", options.outfile.name, "for regressions", file=stderr) exit(1) else: exit(0)
def main(): """Invoke a simple CLI analyser.""" a = ArgumentParser() a.add_argument('-a', '--analyser', metavar='AFILE', help="load tokeniser model from (analyser) AFILE", required=True) a.add_argument('-i', '--input', metavar="INFILE", type=open, dest="infile", help="source of analysis data") a.add_argument('-v', '--verbose', action='store_true', help="print verbosely while processing") a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile", help="print output into OUTFILE", type=FileType('w')) a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile", help="print statistics to STATFILE", type=FileType('w')) a.add_argument('-O', '--output-format', metavar="OUTFORMAT", default="moses", help="format output for OUTFORMAT", choices=['moses', 'conllu', 'json', 'ftb3']) options = a.parse_args() omorfi = Omorfi(options.verbose) if options.analyser: if options.verbose: print("reading language model", options.analyser) omorfi.load_analyser(options.analyser) else: print("analyser is needed for tokenisation", file=stderr) exit(1) if not options.infile: options.infile = stdin if options.verbose: print("analysing", options.infile.name) if not options.outfile: options.outfile = stdout if options.verbose: print("writing to", options.outfile.name) if not options.statfile: options.statfile = stderr # statistics realstart = perf_counter() cpustart = process_time() tokens = 0 lines = 0 if options.output_format == 'conllu': print("# new doc id=", options.infile.name, file=options.outfile) for line in options.infile: line = line lines += 1 if options.verbose and lines % 10000 == 0: print(lines, "...") if not line or line.rstrip('\n') == '': continue surfs = omorfi.tokenise(line) tokens += len(surfs) if options.output_format == 'moses': print(' '.join([surf.surf for surf in surfs]), file=options.outfile) elif options.output_format == 'json': print(json.encode(surfs)) elif options.output_format == 'conllu': print("# sent_id =", lines, file=options.outfile) print("# text =", line.rstrip("\n"), file=options.outfile) i = 1 for surf in surfs: print(i, surf['surf'], "_", "_", "_", "_", "_", "_", "_", "_", sep="\t", file=options.outfile) i += 1 elif options.output_format == 'ftb3': print("<s><loc file=\"", options.infile.name, "\" line=\"", lines, "\" />", file=options.outfile, sep="") i = 1 for surf in surfs: print(i, surf['surf'], "_", "_", "_", "_", "_", "_", "_", "_", sep="\t", file=options.outfile) i += 1 print("</s>", file=options.outfile) if options.output_format == 'conllu': print(file=options.outfile) cpuend = process_time() realend = perf_counter() print("Lines:", lines, "Tokens:", tokens, "Ratio:", tokens / lines, "tokens/line", file=options.statfile) print("CPU time:", cpuend - cpustart, "Real time:", realend - realstart, file=options.statfile) print("Tokens per timeunit:", tokens / (realend - realstart), "Lines per timeunit:", lines / (realend - realstart), file=options.statfile) exit(0)