def main():
    """Invoke a simple CLI analyser.

    Reads text from --input (default stdin), tokenises and analyses each
    token with omorfi, prints VISL CG-3 formatted analyses to --output
    (default stdout), and reports token/unknown statistics to --statistics.
    Exits with status 0.
    """
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAPATH',
                   help="Path to directory of HFST format automata")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                   help="print output into OUTFILE", type=FileType('w'))
    a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile",
                   help="print statistics to STATFILE", type=FileType('w'))
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.fsa:
        if options.verbose:
            print("reading language models in", options.fsa)
        omorfi.load_from_dir(options.fsa, analyse=True, accept=True)
    else:
        if options.verbose:
            print("reading language models in default dirs")
        omorfi.load_from_dir()
    if not options.infile:
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        options.statfile = stdout
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    unknowns = 0
    for line in options.infile:
        if not line or line == '':
            continue
        surfs = omorfi.tokenise(line)
        for surf in surfs:
            tokens += 1
            anals = omorfi.analyse(surf)
            print_analyses_vislcg3(surf, anals, options.outfile)
            # a lone analysis containing UNKNOWN marks an OOV token
            if len(anals) == 0 or (len(anals) == 1 and
                                   'UNKNOWN' in anals[0][0]):
                unknowns += 1
    cpuend = process_time()
    realend = perf_counter()
    # guard against empty input: unknowns / tokens would raise
    # ZeroDivisionError when no tokens were seen
    if tokens > 0:
        print("Tokens:", tokens, "Unknown:", unknowns,
              unknowns / tokens * 100, "%", file=options.statfile)
    else:
        print("Tokens:", tokens, "Unknown:", unknowns, file=options.statfile)
    print("CPU time:", cpuend - cpustart,
          "Real time:", realend - realstart, file=options.statfile)
    print("Tokens per timeunit:", tokens / (realend - realstart),
          file=options.statfile)
    exit(0)
def main():
    """Invoke a simple CLI analyser.

    Tokenises and analyses --input (default stdin) with omorfi, writes
    VISL CG-3 output to --output (default stdout) and timing/coverage
    statistics to --statistics. Exits with status 0.
    """
    argp = ArgumentParser()
    argp.add_argument('-f', '--fsa', metavar='FSAPATH',
                      help="Path to directory of HFST format automata")
    argp.add_argument('-i', '--input', metavar="INFILE", type=open,
                      dest="infile", help="source of analysis data")
    argp.add_argument('-v', '--verbose', action='store_true',
                      help="print verbosely while processing")
    argp.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                      help="print output into OUTFILE", type=FileType('w'))
    argp.add_argument('-x', '--statistics', metavar="STATFILE",
                      dest="statfile",
                      help="print statistics to STATFILE",
                      type=FileType('w'))
    options = argp.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.fsa:
        if options.verbose:
            print("reading language models in", options.fsa)
        omorfi.load_from_dir(options.fsa, analyse=True, accept=True)
    else:
        if options.verbose:
            print("reading language models in default dirs")
        omorfi.load_from_dir()
    # fall back to the standard streams when files were not given
    if not options.infile:
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        options.statfile = stdout
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    unknowns = 0
    for line in options.infile:
        if not line or line == '':
            continue
        for surf in omorfi.tokenise(line):
            tokens += 1
            anals = omorfi.analyse(surf)
            print_analyses_vislcg3(surf, anals, options.outfile)
            # a single UNKNOWN analysis (or none) marks an OOV token
            if len(anals) == 0 or (len(anals) == 1 and
                                   'UNKNOWN' in anals[0][0]):
                unknowns += 1
    cpuend = process_time()
    realend = perf_counter()
    # empty input would make unknowns / tokens raise ZeroDivisionError
    if tokens > 0:
        print("Tokens:", tokens, "Unknown:", unknowns,
              unknowns / tokens * 100, "%", file=options.statfile)
    else:
        print("Tokens:", tokens, "Unknown:", unknowns, file=options.statfile)
    print("CPU time:", cpuend - cpustart,
          "Real time:", realend - realstart, file=options.statfile)
    print("Tokens per timeunit:", tokens / (realend - realstart),
          file=options.statfile)
    exit(0)
def stream(text):
    """Yield analysis lines for each token of *text*.

    For every token yields the surface form, then one line per analysis
    ("<analysis> <weight> <rest>"), then a blank line. Loads automata
    from the hard-coded system directory on every call.
    """
    om = Omorfi()
    om.load_from_dir('/usr/local/share/omorfi/', analyse=True)
    for token in om.tokenise(text):
        yield "%s\n" % token[0]
        for analyse_res in om.analyse(token):
            # renamed from `text` to avoid shadowing the parameter
            anal, weight = analyse_res[:2]
            if len(analyse_res) > 2:
                rest = " ".join([str(x) for x in analyse_res[2:]])
            else:
                rest = ''
            yield "%s %s %s\n" % (anal, weight, rest)
        yield "\n"
def main():
    """Invoke a simple CLI analyser.

    Tokenises --input (default stdin) and writes either space-separated
    tokens (moses) or CoNLL-U-style token lines (conllu) to --output
    (default stdout); timing statistics go to --statistics. Exits 0.
    """
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAPATH',
                   help="Path to directory of HFST format automata")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                   help="print output into OUTFILE", type=FileType('w'))
    a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile",
                   help="print statistics to STATFILE", type=FileType('w'))
    a.add_argument('-O', '--output-format', metavar="OUTFORMAT",
                   default="moses",
                   help="format output for OUTFORMAT",
                   choices=['moses', 'conllu'])
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.fsa:
        if options.verbose:
            print("reading language models in", options.fsa)
        omorfi.load_from_dir(options.fsa, analyse=True, accept=True)
    else:
        if options.verbose:
            print("reading language models in default dirs")
        omorfi.load_from_dir()
    if not options.infile:
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        options.statfile = stdout
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    lines = 0
    if options.output_format == 'conllu':
        print("# doc-name:", options.infile.name, file=options.outfile)
    for line in options.infile:
        lines += 1
        if options.verbose and lines % 10000 == 0:
            print(lines, "...")
        if not line or line.rstrip('\n') == '':
            continue
        surfs = omorfi.tokenise(line)
        tokens += len(surfs)
        if options.output_format == 'moses':
            print(' '.join([surf[0] for surf in surfs]),
                  file=options.outfile)
        else:
            print("# sentence-text:", line.rstrip("\n"),
                  file=options.outfile)
            # CoNLL-U style: index, form, nine underscore columns, then
            # token's second field as trailing misc
            for i, surf in enumerate(surfs, 1):
                print(i, surf[0], "_", "_", "_", "_", "_", "_", "_",
                      surf[1], sep="\t", file=options.outfile)
        if options.output_format == 'conllu':
            # blank line terminates a CoNLL-U sentence
            print(file=options.outfile)
    cpuend = process_time()
    realend = perf_counter()
    # guard: tokens / lines would raise ZeroDivisionError on empty input
    if lines > 0:
        print("Lines:", lines, "Tokens:", tokens, "Ratio:",
              tokens / lines, "tokens/line", file=options.statfile)
    else:
        print("Lines:", lines, "Tokens:", tokens, file=options.statfile)
    print("CPU time:", cpuend - cpustart,
          "Real time:", realend - realstart, file=options.statfile)
    print("Tokens per timeunit:", tokens / (realend - realstart),
          "Lines per timeunit:", lines / (realend - realstart),
          file=options.statfile)
    exit(0)
def main():
    """Segment text in some formats.

    Loads a segmenter and labelsegmenter (both required), segments each
    token of --input (default stdin) and prints moses-factors or plain
    segment output to --output (default stdout). Exits 0 on success,
    1 when segmenters could not be loaded, 2 when none was given.
    """
    a = ArgumentParser()
    a.add_argument('-s', '--segmenter', metavar='SFILE',
                   help="load segmenter from SFILE", required=True)
    a.add_argument('-S', '--labeller', metavar='LSFILE',
                   help="load labelsegmenter from LSFILE", required=True)
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE",
                   help="print segments into OUTFILE")
    a.add_argument('-O', '--output-format', metavar="OFORMAT",
                   help="format output suitable for OFORMAT",
                   required=True,
                   choices=["moses-factors", "segments"])
    a.add_argument('--no-split-words', action="store_false", default=True,
                   dest="split_words",
                   help="split on word boundaries")
    a.add_argument(
        '--no-split-new-words', action="store_false", default=True,
        dest="split_new_words",
        help="split on new word boundaries (prev. unattested compounds)")
    a.add_argument('--no-split-morphs', action="store_false", default=True,
                   dest="split_morphs",
                   help="split on morph boundaries")
    a.add_argument('--split-derivs', action="store_true", default=False,
                   help="split on derivation boundaries")
    a.add_argument('--split-nonwords', action="store_true", default=False,
                   help="split on other boundaries")
    a.add_argument('--segment-marker', default='→ ←', metavar='SEG',
                   help="mark segment boundaries with SEG")
    a.add_argument('--show-ambiguous', default=False, metavar='ASEP',
                   help="separate ambiguous segmentations with SEG")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.segmenter:
        if options.verbose:
            print("Reading segmenter", options.segmenter)
        omorfi.load_segmenter(options.segmenter)
    else:
        print("segmenter is needed for segmenting", file=stderr)
        exit(2)
    if options.labeller:
        if options.verbose:
            print("Reading labelsegmenter", options.labeller)
        omorfi.load_labelsegmenter(options.labeller)
    if not omorfi.can_segment:
        # NOTE(review): this parser has no -f option (models are loaded via
        # -s/-S); the hint below looks stale — confirm against the CLI docs.
        print("Could not load segmenter(s), re-compile them or use -f option")
        print()
        print("To compile segmenter, use --enable-segmenter, and/or",
              "--enable-labeled-segments")
        exit(1)
    if options.infile:
        infile = options.infile
    else:
        options.infile = stdin
        infile = stdin
    if options.output:
        outfile = open(options.output, 'w')
    else:
        options.output = "<stdout>"
        outfile = stdout
    if options.segment_marker is None:
        if options.verbose:
            print("Default segment marker is → ←")
        options.segment_marker = '→ ←'
    if options.verbose:
        print("reading from", options.infile.name)
    if options.verbose:
        # typo fix: message used to read "writign to"
        print("writing to", options.output)
    linen = 0
    for line in infile:
        line = line.strip()
        linen += 1
        if options.verbose and linen % 10000 == 0:
            print(linen, '...')
        if not line or line == '':
            print(file=outfile)
            continue
        tokens = omorfi.tokenise(line)
        for token in tokens:
            segments = omorfi.segment(token)
            labelsegments = omorfi.labelsegment(token)
            if options.output_format == 'moses-factors':
                print_moses_factor_segments(segments, labelsegments, token,
                                            outfile, options)
            elif options.output_format == 'segments':
                print_segments(segments, labelsegments, token, outfile,
                               options)
        print(file=outfile)
    exit(0)
def main():
    """Segment text in some formats.

    Loads segment/labelsegment automata from --fsa (or default dirs),
    segments each token of --input (default stdin) and prints
    moses-factors or plain segment output to --output (default stdout).
    Exits with status 0.
    """
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAPATH',
                   help="Path to directory of HFST format automata")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE",
                   help="print segments into OUTFILE")
    a.add_argument('-O', '--output-format', metavar="OFORMAT",
                   help="format output suitable for OFORMAT",
                   choices=["labels-tsv", "moses-factors", "segments"])
    a.add_argument('--no-split-words', action="store_false", default=True,
                   dest="split_words",
                   help="split on word boundaries")
    a.add_argument(
        '--no-split-new-words', action="store_false", default=True,
        dest="split_new_words",
        help="split on new word boundaries (prev. unattested compounds)")
    a.add_argument('--no-split-morphs', action="store_false", default=True,
                   dest="split_morphs",
                   help="split on morph boundaries")
    a.add_argument('--split-derivs', action="store_true", default=False,
                   help="split on derivation boundaries")
    a.add_argument('--split-nonwords', action="store_true", default=False,
                   help="split on other boundaries")
    a.add_argument('--segment-marker', default='→ ←', metavar='SEG',
                   help="mark segment boundaries with SEG")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.fsa:
        if options.verbose:
            print("Reading automata dir", options.fsa)
        omorfi.load_from_dir(options.fsa, segment=True, labelsegment=True)
    else:
        if options.verbose:
            print("Searching for automata everywhere...")
        omorfi.load_from_dir(labelsegment=True, segment=True)
    if options.infile:
        infile = options.infile
    else:
        options.infile = stdin
        infile = stdin
    if options.output:
        outfile = open(options.output, 'w')
    else:
        options.output = "<stdout>"
        outfile = stdout
    if options.segment_marker is None:
        if options.verbose:
            print("Default segment marker is → ←")
        options.segment_marker = '→ ←'
    if options.verbose:
        print("reading from", options.infile.name)
    if options.verbose:
        # typo fix: message used to read "writign to"
        print("writing to", options.output)
    linen = 0
    for line in infile:
        line = line.strip()
        linen += 1
        if options.verbose and linen % 10000 == 0:
            print(linen, '...')
        if not line or line == '':
            print(file=outfile)
            continue
        tokens = omorfi.tokenise(line)
        for token in tokens:
            # tokens are tuples here; segment the surface form token[0]
            segments = omorfi.segment(token[0])
            labelsegments = omorfi.labelsegment(token[0])
            if options.output_format == 'moses-factors':
                print_moses_factor_segments(
                    segments, labelsegments, token[0], outfile, options)
            elif options.output_format == 'segments':
                print_segments(segments, labelsegments, token[0], outfile,
                               options)
        print(file=outfile)
    exit(0)
def main():
    """Invoke a simple CLI analyser.

    Tokenises --input (default stdin) with the required --analyser model
    and writes moses, json, conllu or ftb3 formatted tokens to --output
    (default stdout); statistics to --statistics. Exits 0 on success,
    1 when no analyser was given.
    """
    a = ArgumentParser()
    a.add_argument('-a', '--analyser', metavar='AFILE',
                   help="load tokeniser model from (analyser) AFILE",
                   required=True)
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                   help="print output into OUTFILE", type=FileType('w'))
    a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile",
                   help="print statistics to STATFILE", type=FileType('w'))
    a.add_argument('-O', '--output-format', metavar="OUTFORMAT",
                   default="moses",
                   help="format output for OUTFORMAT",
                   choices=['moses', 'conllu', 'json', 'ftb3'])
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.analyser:
        if options.verbose:
            print("reading language model", options.analyser)
        omorfi.load_analyser(options.analyser)
    else:
        print("analyser is needed for tokenisation", file=stderr)
        exit(1)
    if not options.infile:
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        options.statfile = stdout
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    lines = 0
    if options.output_format == 'conllu':
        print("# new doc id=", options.infile.name, file=options.outfile)
    for line in options.infile:
        lines += 1
        if options.verbose and lines % 10000 == 0:
            print(lines, "...")
        if not line or line.rstrip('\n') == '':
            continue
        surfs = omorfi.tokenise(line)
        tokens += len(surfs)
        if options.output_format == 'moses':
            print(' '.join([surf['surf'] for surf in surfs]),
                  file=options.outfile)
        elif options.output_format == 'json':
            # BUG FIX: the json module has no encode(); use dumps(), and
            # honour --output like every other format branch does
            print(json.dumps(surfs), file=options.outfile)
        elif options.output_format == 'conllu':
            print("# sent_id =", lines, file=options.outfile)
            print("# text =", line.rstrip("\n"), file=options.outfile)
            for i, surf in enumerate(surfs, 1):
                print(i, surf['surf'], "_", "_", "_", "_", "_", "_", "_",
                      format_misc_ud(surf), sep="\t", file=options.outfile)
        elif options.output_format == 'ftb3':
            print("<s><loc file=\"", options.infile.name, "\" line=\"",
                  lines, "\" />", file=options.outfile, sep="")
            for i, surf in enumerate(surfs, 1):
                print(i, surf['surf'], "_", "_", "_", "_", "_", "_", "_",
                      "_", sep="\t", file=options.outfile)
            print("</s>", file=options.outfile)
        if options.output_format == 'conllu':
            # blank line terminates a CoNLL-U sentence
            print(file=options.outfile)
    cpuend = process_time()
    realend = perf_counter()
    # guard: tokens / lines would raise ZeroDivisionError on empty input
    if lines > 0:
        print("Lines:", lines, "Tokens:", tokens, "Ratio:",
              tokens / lines, "tokens/line", file=options.statfile)
    else:
        print("Lines:", lines, "Tokens:", tokens, file=options.statfile)
    print("CPU time:", cpuend - cpustart,
          "Real time:", realend - realstart, file=options.statfile)
    print("Tokens per timeunit:", tokens / (realend - realstart),
          "Lines per timeunit:", lines / (realend - realstart),
          file=options.statfile)
    exit(0)
def main():
    """Segment text in some formats.

    Loads segment/labelsegment automata from --fsa (or default dirs),
    segments each token of --input (default stdin) and prints
    moses-factors or plain segment output to --output (default stdout).
    Exits with status 0.
    """
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAPATH',
                   help="Path to directory of HFST format automata")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE",
                   help="print segments into OUTFILE")
    a.add_argument('-O', '--output-format', metavar="OFORMAT",
                   help="format output suitable for OFORMAT",
                   choices=["labels-tsv", "moses-factors", "segments"])
    a.add_argument('--split-words', action="store_true", default=True,
                   help="split on word boundaries")
    a.add_argument(
        '--split-new-words', action="store_true", default=True,
        help="split on new word boundaries (prev. unattested compounds)")
    a.add_argument('--split-morphs', action="store_true", default=True,
                   help="split on morph boundaries")
    a.add_argument('--split-derivs', action="store_true", default=False,
                   help="split on derivation boundaries")
    a.add_argument('--split-nonwords', action="store_true", default=True,
                   help="split on other boundaries")
    a.add_argument('--segment-marker', default=' ', metavar='SEG',
                   help="mark segment boundaries with SEG")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.fsa:
        if options.verbose:
            print("Reading automata dir", options.fsa)
        omorfi.load_from_dir(options.fsa, segment=True, labelsegment=True)
    else:
        if options.verbose:
            print("Searching for automata everywhere...")
        omorfi.load_from_dir(labelsegment=True, segment=True)
    if options.infile:
        infile = options.infile
    else:
        # BUG FIX: record the fallback on options too; otherwise the
        # verbose print below crashes on options.infile.name (None)
        options.infile = stdin
        infile = stdin
    if options.output:
        outfile = open(options.output, 'w')
    else:
        # BUG FIX: give the verbose print a readable name instead of None
        options.output = "<stdout>"
        outfile = stdout
    if options.verbose:
        print("reading from", options.infile.name)
    if options.verbose:
        # typo fix: message used to read "writign to"
        print("writing to", options.output)
    linen = 0
    for line in infile:
        line = line.strip()
        linen += 1
        if options.verbose and linen % 10000 == 0:
            print(linen, '...')
        if not line or line == '':
            continue
        surfs = omorfi.tokenise(line)
        for surf in surfs:
            segments = omorfi.segment(surf)
            labelsegments = omorfi.labelsegment(surf)
            if options.output_format == 'moses-factors':
                # NOTE(review): sibling CLIs pass *options* as a fifth
                # argument here — confirm this helper's signature
                print_moses_factor_segments(segments, labelsegments, surf,
                                            outfile)
            elif options.output_format == 'segments':
                print_segments(segments, labelsegments, surf, outfile,
                               options)
        print(file=outfile)
    exit(0)
def stream(text):
    """Yield one line of space-separated lemmas per token of *text*.

    Loads automata from the hard-coded system directory on every call.
    """
    om = Omorfi()
    om.load_from_dir('/usr/local/share/omorfi/', lemmatise=True)
    for token in om.tokenise(text):
        # str is callable directly; the lambda wrapper was redundant
        yield " ".join(map(str, om.lemmatise(token[0])))