def main():
    """Segment text in some formats.

    Reads lines from --input (or stdin), tokenises each line with omorfi
    and prints every token's segmentation to --output (or stdout) in the
    format selected by --output-format.

    Exits 2 if no segmenter is given, 1 if segmenters fail to load,
    0 on success.
    """
    a = ArgumentParser()
    a.add_argument('-s', '--segmenter', metavar='SFILE',
                   help="load segmenter from SFILE", required=True)
    a.add_argument('-S', '--labeller', metavar='LSFILE',
                   help="load labelsegmenter from LSFILE", required=True)
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE",
                   help="print segments into OUTFILE")
    a.add_argument('-O', '--output-format', metavar="OFORMAT",
                   help="format output suitable for OFORMAT",
                   required=True, choices=["moses-factors", "segments"])
    a.add_argument('--no-split-words', action="store_false", default=True,
                   dest="split_words", help="split on word boundaries")
    a.add_argument('--no-split-new-words', action="store_false", default=True,
                   dest="split_new_words",
                   help="split on new word boundaries (prev. unattested compounds)")
    a.add_argument('--no-split-morphs', action="store_false", default=True,
                   dest="split_morphs", help="split on morph boundaries")
    a.add_argument('--split-derivs', action="store_true", default=False,
                   help="split on derivation boundaries")
    a.add_argument('--split-nonwords', action="store_true", default=False,
                   help="split on other boundaries")
    a.add_argument('--segment-marker', default='→ ←', metavar='SEG',
                   help="mark segment boundaries with SEG")
    a.add_argument('--show-ambiguous', default=False, metavar='ASEP',
                   # fixed help text: the separator for this option is ASEP,
                   # not SEG (which belongs to --segment-marker)
                   help="separate ambiguous segmentations with ASEP")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.segmenter:
        if options.verbose:
            print("Reading segmenter", options.segmenter)
        omorfi.load_segmenter(options.segmenter)
    else:
        print("segmenter is needed for segmenting", file=stderr)
        exit(2)
    if options.labeller:
        if options.verbose:
            print("Reading labelsegmenter", options.labeller)
        omorfi.load_labelsegmenter(options.labeller)
    if not omorfi.can_segment:
        # NOTE(review): "-f option" looks stale — this version of the tool
        # has no -f flag; confirm intended wording before changing it.
        print("Could not load segmenter(s), re-compile them or use -f option")
        print()
        print("To compile segmenter, use --enable-segmenter, and/or",
              "--enable-labeled-segments")
        exit(1)
    if options.infile:
        infile = options.infile
    else:
        # store stdin back into options so the verbose print below can
        # access options.infile.name safely
        options.infile = stdin
        infile = stdin
    if options.output:
        outfile = open(options.output, 'w')
    else:
        options.output = "<stdout>"
        outfile = stdout
    if options.segment_marker is None:
        # unreachable with the current argparse default of '→ ←';
        # kept as a safety net should the default ever be removed
        if options.verbose:
            print("Default segment marker is → ←")
        options.segment_marker = '→ ←'
    if options.verbose:
        print("reading from", options.infile.name)
    if options.verbose:
        # fixed typo in user-facing message: was "writign to"
        print("writing to", options.output)
    linen = 0
    for line in infile:
        line = line.strip()
        linen += 1
        if options.verbose and linen % 10000 == 0:
            print(linen, '...')
        if not line:
            # keep output line-aligned with input: blank in, blank out
            print(file=outfile)
            continue
        tokens = omorfi.tokenise(line)
        for token in tokens:
            segments = omorfi.segment(token)
            labelsegments = omorfi.labelsegment(token)
            if options.output_format == 'moses-factors':
                print_moses_factor_segments(segments, labelsegments, token,
                                            outfile, options)
            elif options.output_format == 'segments':
                print_segments(segments, labelsegments, token, outfile,
                               options)
        print(file=outfile)
    exit(0)
def main():
    """Segment text in some formats.

    Loads segmenting and labelsegmenting automata from --fsa (or by
    searching default locations), then segments each token of each
    input line, writing results in the chosen --output-format.
    """
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAPATH',
                   help="Path to directory of HFST format automata")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE",
                   help="print segments into OUTFILE")
    a.add_argument('-O', '--output-format', metavar="OFORMAT",
                   help="format output suitable for OFORMAT",
                   choices=["labels-tsv", "moses-factors", "segments"])
    a.add_argument('--no-split-words', action="store_false", default=True,
                   dest="split_words", help="split on word boundaries")
    a.add_argument('--no-split-new-words', action="store_false", default=True,
                   dest="split_new_words",
                   help="split on new word boundaries (prev. unattested compounds)")
    a.add_argument('--no-split-morphs', action="store_false", default=True,
                   dest="split_morphs", help="split on morph boundaries")
    a.add_argument('--split-derivs', action="store_true", default=False,
                   help="split on derivation boundaries")
    a.add_argument('--split-nonwords', action="store_true", default=False,
                   help="split on other boundaries")
    a.add_argument('--segment-marker', default='→ ←', metavar='SEG',
                   help="mark segment boundaries with SEG")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.fsa:
        if options.verbose:
            print("Reading automata dir", options.fsa)
        omorfi.load_from_dir(options.fsa, segment=True, labelsegment=True)
    else:
        if options.verbose:
            print("Searching for automata everywhere...")
        omorfi.load_from_dir(labelsegment=True, segment=True)
    if options.infile:
        infile = options.infile
    else:
        # store stdin back into options so options.infile.name works below
        options.infile = stdin
        infile = stdin
    if options.output:
        outfile = open(options.output, 'w')
    else:
        options.output = "<stdout>"
        outfile = stdout
    if options.segment_marker is None:
        # unreachable with the current argparse default of '→ ←';
        # kept as a safety net should the default ever be removed
        if options.verbose:
            print("Default segment marker is → ←")
        options.segment_marker = '→ ←'
    if options.verbose:
        print("reading from", options.infile.name)
    if options.verbose:
        # fixed typo in user-facing message: was "writign to"
        print("writing to", options.output)
    linen = 0
    for line in infile:
        line = line.strip()
        linen += 1
        if options.verbose and linen % 10000 == 0:
            print(linen, '...')
        if not line:
            # keep output line-aligned with input: blank in, blank out
            print(file=outfile)
            continue
        tokens = omorfi.tokenise(line)
        for token in tokens:
            # tokens here are tuples; the surface form is element 0
            segments = omorfi.segment(token[0])
            labelsegments = omorfi.labelsegment(token[0])
            if options.output_format == 'moses-factors':
                print_moses_factor_segments(
                    segments, labelsegments, token[0], outfile, options)
            elif options.output_format == 'segments':
                print_segments(segments, labelsegments, token[0], outfile,
                               options)
        print(file=outfile)
    exit(0)
def main():
    """Segment text in some formats.

    Loads segmenting and labelsegmenting automata from --fsa (or by
    searching default locations), then segments each token of each
    input line, writing results in the chosen --output-format.
    """
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAPATH',
                   help="Path to directory of HFST format automata")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE",
                   help="print segments into OUTFILE")
    a.add_argument('-O', '--output-format', metavar="OFORMAT",
                   help="format output suitable for OFORMAT",
                   choices=["labels-tsv", "moses-factors", "segments"])
    # NOTE(review): store_true with default=True makes these --split-*
    # flags no-ops (they can never be turned off); later versions use
    # --no-split-* with store_false instead. Kept as-is for compatibility.
    a.add_argument('--split-words', action="store_true", default=True,
                   help="split on word boundaries")
    a.add_argument('--split-new-words', action="store_true", default=True,
                   help="split on new word boundaries (prev. unattested compounds)")
    a.add_argument('--split-morphs', action="store_true", default=True,
                   help="split on morph boundaries")
    a.add_argument('--split-derivs', action="store_true", default=False,
                   help="split on derivation boundaries")
    a.add_argument('--split-nonwords', action="store_true", default=True,
                   help="split on other boundaries")
    a.add_argument('--segment-marker', default=' ', metavar='SEG',
                   help="mark segment boundaries with SEG")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.fsa:
        if options.verbose:
            print("Reading automata dir", options.fsa)
        omorfi.load_from_dir(options.fsa, segment=True, labelsegment=True)
    else:
        if options.verbose:
            print("Searching for automata everywhere...")
        omorfi.load_from_dir(labelsegment=True, segment=True)
    if options.infile:
        infile = options.infile
    else:
        infile = stdin
    if options.output:
        outfile = open(options.output, 'w')
    else:
        outfile = stdout
    if options.verbose:
        # fixed: the original printed options.infile.name, which raises
        # AttributeError when reading stdin (options.infile is None)
        print("reading from", infile.name)
    if options.verbose:
        # fixed typo "writign"; print the stream's name instead of
        # options.output, which is None when writing to stdout
        print("writing to", outfile.name)
    linen = 0
    for line in infile:
        line = line.strip()
        linen += 1
        if options.verbose and linen % 10000 == 0:
            print(linen, '...')
        if not line:
            continue
        surfs = omorfi.tokenise(line)
        for surf in surfs:
            segments = omorfi.segment(surf)
            labelsegments = omorfi.labelsegment(surf)
            if options.output_format == 'moses-factors':
                # NOTE(review): called without `options` here, unlike
                # print_segments below — verify against the helper's
                # signature for this version
                print_moses_factor_segments(segments, labelsegments, surf,
                                            outfile)
            elif options.output_format == 'segments':
                print_segments(segments, labelsegments, surf, outfile,
                               options)
        print(file=outfile)
    exit(0)
def main():
    """Preprocess text for moses factored modeling.

    For every whitespace-separated surface token, prints a pipe-separated
    factor tuple: surface|lemmas|POS|morphosyntactic-tags|morphs.
    """
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAPATH',
                   help="Path to directory of HFST format automata")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE",
                   help="print factors into OUTFILE")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.fsa:
        if options.verbose:
            print("Reading automata dir", options.fsa)
        omorfi.load_from_dir(options.fsa)
    else:
        if options.verbose:
            print("Searching for automata everywhere...")
        omorfi.load_from_dir()
    if options.infile:
        infile = options.infile
    else:
        infile = stdin
    if options.output:
        outfile = open(options.output, 'w')
    else:
        outfile = stdout
    if options.verbose:
        # fixed: the original printed options.infile.name, which raises
        # AttributeError when reading stdin (options.infile is None)
        print("reading from", infile.name)
    if options.verbose:
        # fixed typo "writign"; print the stream's name instead of
        # options.output, which is None when writing to stdout
        print("writing to", outfile.name)
    # raw strings so "\[" is a regex escape, not an (invalid) string escape
    re_lemma = re.compile(r"\[WORD_ID=([^]]*)\]")
    re_pos = re.compile(r"\[POS=([^]]*)\]")
    re_mrd = re.compile(r"\[([^=]*)=([^]]*)]")
    linen = 0
    for line in infile:
        line = line.strip()
        linen += 1
        if options.verbose and linen % 10000 == 0:
            print(linen, '...')
        if not line:
            continue
        surfs = line.split()
        for surf in surfs:
            anals = omorfi.analyse(surf)
            segments = omorfi.segment(surf)
            # use only the first (best) analysis string
            pos_matches = re_pos.finditer(anals[0][0])
            pos = "UNK"
            mrds = []
            lemmas = []
            for pm in pos_matches:
                pos = pm.group(1)
            lemma_matches = re_lemma.finditer(anals[0][0])
            for lm in lemma_matches:
                lemmas += [lm.group(1)]
            mrd_matches = re_mrd.finditer(anals[0][0])
            for mm in mrd_matches:
                if mm.group(1) == 'WORD_ID':
                    # a new WORD_ID starts a new word: keep only the tags
                    # of the last word of a compound
                    mrds = []
                elif mm.group(1) == 'WEIGHT':
                    pass
                else:
                    mrds += [mm.group(2)]
            # everything from the last {STUB} onwards, markers stripped
            # NOTE(review): if "{STUB}" is absent, rfind returns -1 and this
            # keeps only the final character — confirm that is intended
            stemfixes = segments[0][0][
                segments[0][0].rfind("{STUB}"):].replace("{STUB}", "")
            if '{' in stemfixes:
                morphs = stemfixes[stemfixes.find("{"):].replace("{MB}", ".")
            else:
                morphs = '0'
            print(surf, '+'.join(lemmas), pos, '.'.join(mrds), morphs,
                  sep='|', end=' ', file=outfile)
        print(file=outfile)
    exit(0)
def main():
    """Preprocess text for moses factored modeling.

    For every whitespace-separated surface token, prints a pipe-separated
    factor tuple: surface|lemmas|UPOS|morphosyntactic-tags|morphs.
    Requires both an analyser (-a) and a segmenter (-s) model.
    """
    a = ArgumentParser()
    a.add_argument('-a', '--analyser', metavar='AFILE',
                   help="load analyser model from AFILE", required=True)
    a.add_argument('-s', '--segmenter', metavar='SFILE',
                   help="load segmenter model from SFILE", required=True)
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE",
                   help="print factors into OUTFILE")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.analyser:
        if options.verbose:
            print("Reading analyser model", options.analyser)
        omorfi.load_analyser(options.analyser)
    else:
        print("at least analyser file is needed", file=stderr)
        exit(1)
    if options.segmenter:
        if options.verbose:
            print("Reading segmenter model", options.segmenter)
        omorfi.load_segmenter(options.segmenter)
    else:
        print("at least segmenter file is needed", file=stderr)
        exit(1)
    if options.infile:
        infile = options.infile
    else:
        infile = stdin
    if options.output:
        outfile = open(options.output, 'w')
    else:
        outfile = stdout
    if options.verbose:
        print("reading from", infile.name)
    if options.verbose:
        # fixed typo in user-facing message: was "writign to"
        print("writing to", outfile.name)
    # raw strings so "\[" is a regex escape, not an (invalid) string escape
    re_lemma = re.compile(r"\[WORD_ID=([^]]*)\]")
    re_pos = re.compile(r"\[UPOS=([^]]*)\]")
    re_mrd = re.compile(r"\[([^=]*)=([^]]*)]")
    linen = 0
    for line in infile:
        line = line.strip()
        linen += 1
        if options.verbose and linen % 10000 == 0:
            print(linen, '...')
        if not line:
            continue
        surfs = line.split()
        for surf in surfs:
            anals = omorfi.analyse(surf)
            segments = omorfi.segment(surf)
            # use only the first (best) analysis string
            pos_matches = re_pos.finditer(anals[0]['anal'])
            pos = "UNK"
            mrds = []
            lemmas = []
            for pm in pos_matches:
                pos = pm.group(1)
            lemma_matches = re_lemma.finditer(anals[0]['anal'])
            for lm in lemma_matches:
                lemmas += [lm.group(1)]
            mrd_matches = re_mrd.finditer(anals[0]['anal'])
            for mm in mrd_matches:
                if mm.group(1) == 'WORD_ID':
                    # a new WORD_ID starts a new word: keep only the tags
                    # of the last word of a compound
                    mrds = []
                elif mm.group(1) == 'WEIGHT':
                    pass
                else:
                    mrds += [mm.group(2)]
            # take the suffix string after the last derivation/word/hyphen
            # boundary marker in the best segmentation
            parts = segments[0]['segments']
            if '{DB}' in parts:
                suffixes = parts[parts.rfind('{DB}') + 4:]
            elif '{WB}' in parts:
                suffixes = parts[parts.rfind('{WB}') + 4:]
            elif '{hyph?}' in parts:
                # NOTE(review): '{hyph?}' is 7 chars but offset is +6,
                # leaving a trailing '}' in suffixes — confirm intended
                suffixes = parts[parts.rfind('{hyph?}') + 6:]
            else:
                suffixes = "0"
            morphs = suffixes[suffixes.find("{"):].replace("{MB}", ".")
            print(surf, '+'.join(lemmas), pos, '.'.join(mrds), morphs,
                  sep='|', end=' ', file=outfile)
        print(file=outfile)
    exit(0)
def main():
    """Preprocess text for moses factored modeling.

    Tokenises each input line and, for every token, prints a
    pipe-separated factor tuple: surface|lemmas|UPOS|ufeats|morphs.
    Uses the token/analysis object API (get_best, get_upos, ...).
    Requires both an analyser (-a) and a segmenter (-s) model.
    """
    a = ArgumentParser()
    a.add_argument('-a', '--analyser', metavar='AFILE',
                   help="load analyser model from AFILE", required=True)
    a.add_argument('-s', '--segmenter', metavar='SFILE',
                   help="load segmenter model from SFILE", required=True)
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE",
                   help="print factors into OUTFILE")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.analyser:
        if options.verbose:
            print("Reading analyser model", options.analyser)
        omorfi.load_analyser(options.analyser)
    else:
        print("at least analyser file is needed", file=stderr)
        exit(1)
    if options.segmenter:
        if options.verbose:
            print("Reading segmenter model", options.segmenter)
        omorfi.load_segmenter(options.segmenter)
    else:
        print("at least segmenter file is needed", file=stderr)
        exit(1)
    if options.infile:
        infile = options.infile
    else:
        infile = stdin
    if options.output:
        outfile = open(options.output, 'w')
    else:
        outfile = stdout
    if options.verbose:
        print("reading from", infile.name)
    if options.verbose:
        # fixed typo in user-facing message: was "writign to"
        print("writing to", outfile.name)
    linen = 0
    for line in infile:
        line = line.strip()
        linen += 1
        if options.verbose and linen % 10000 == 0:
            print(linen, '...')
        if not line:
            continue
        tokens = omorfi.tokenise_sentence(line)
        for token in tokens:
            if not token.surf:
                # skip non-surface tokens (e.g. sentence boundary markers)
                continue
            anals = omorfi.analyse(token)
            # fall back to surface form with X/? tags when unanalysable
            pos = "X"
            mrds = ["?"]
            lemmas = [token.surf]
            if anals:
                anal = token.get_best()
                pos = anal.get_upos()
                mrds = anal.get_ufeats()
                lemmas = anal.get_lemmas()
            segments = omorfi.segment(token)
            morphs = "0"
            if segments:
                segment = token.get_best_segments()
                if segment:
                    parts = segment.get_segments()
                    morphs = ".".join(parts)
                else:
                    morphs = token.surf
            print(token.surf, '+'.join(lemmas), pos, '.'.join(mrds), morphs,
                  sep='|', end=' ', file=outfile)
        print(file=outfile)
    exit(0)