def main():
    """Invoke a simple CLI analyser.

    Reads text from --input (default stdin), tokenises and analyses each
    token with omorfi, prints VISL CG-3 formatted analyses to --output
    (default stdout), and reports token/unknown statistics to --statistics.
    Exits with status 0.
    """
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAPATH',
                   help="Path to directory of HFST format automata")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                   help="print output into OUTFILE", type=FileType('w'))
    a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile",
                   help="print statistics to STATFILE", type=FileType('w'))
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.fsa:
        if options.verbose:
            print("reading language models in", options.fsa)
        omorfi.load_from_dir(options.fsa, analyse=True, accept=True)
    else:
        if options.verbose:
            print("reading language models in default dirs")
        omorfi.load_from_dir()
    if not options.infile:
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        options.statfile = stdout
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    unknowns = 0
    for line in options.infile:
        if not line or line == '':
            continue
        surfs = omorfi.tokenise(line)
        for surf in surfs:
            tokens += 1
            anals = omorfi.analyse(surf)
            print_analyses_vislcg3(surf, anals, options.outfile)
            # a lone analysis containing UNKNOWN marks an OOV token
            if len(anals) == 0 or (len(anals) == 1 and
                                   'UNKNOWN' in anals[0][0]):
                unknowns += 1
    cpuend = process_time()
    realend = perf_counter()
    # guard against empty input: unknowns / tokens would raise
    # ZeroDivisionError when no tokens were seen
    if tokens > 0:
        print("Tokens:", tokens, "Unknown:", unknowns,
              unknowns / tokens * 100, "%", file=options.statfile)
    else:
        print("Tokens:", tokens, "Unknown:", unknowns, file=options.statfile)
    print("CPU time:", cpuend - cpustart,
          "Real time:", realend - realstart, file=options.statfile)
    print("Tokens per timeunit:", tokens / (realend - realstart),
          file=options.statfile)
    exit(0)
def main():
    """Invoke a simple CLI analyser.

    Tokenises and analyses --input (default stdin) with omorfi, writes
    VISL CG-3 output to --output (default stdout) and timing/coverage
    statistics to --statistics. Exits with status 0.
    """
    argp = ArgumentParser()
    argp.add_argument('-f', '--fsa', metavar='FSAPATH',
                      help="Path to directory of HFST format automata")
    argp.add_argument('-i', '--input', metavar="INFILE", type=open,
                      dest="infile", help="source of analysis data")
    argp.add_argument('-v', '--verbose', action='store_true',
                      help="print verbosely while processing")
    argp.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                      help="print output into OUTFILE", type=FileType('w'))
    argp.add_argument('-x', '--statistics', metavar="STATFILE",
                      dest="statfile",
                      help="print statistics to STATFILE",
                      type=FileType('w'))
    options = argp.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.fsa:
        if options.verbose:
            print("reading language models in", options.fsa)
        omorfi.load_from_dir(options.fsa, analyse=True, accept=True)
    else:
        if options.verbose:
            print("reading language models in default dirs")
        omorfi.load_from_dir()
    # fall back to the standard streams when files were not given
    if not options.infile:
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        options.statfile = stdout
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    unknowns = 0
    for line in options.infile:
        if not line or line == '':
            continue
        for surf in omorfi.tokenise(line):
            tokens += 1
            anals = omorfi.analyse(surf)
            print_analyses_vislcg3(surf, anals, options.outfile)
            # a single UNKNOWN analysis (or none) marks an OOV token
            if len(anals) == 0 or (len(anals) == 1 and
                                   'UNKNOWN' in anals[0][0]):
                unknowns += 1
    cpuend = process_time()
    realend = perf_counter()
    # empty input would make unknowns / tokens raise ZeroDivisionError
    if tokens > 0:
        print("Tokens:", tokens, "Unknown:", unknowns,
              unknowns / tokens * 100, "%", file=options.statfile)
    else:
        print("Tokens:", tokens, "Unknown:", unknowns, file=options.statfile)
    print("CPU time:", cpuend - cpustart,
          "Real time:", realend - realstart, file=options.statfile)
    print("Tokens per timeunit:", tokens / (realend - realstart),
          file=options.statfile)
    exit(0)
def stream(text):
    """Yield analysis lines for each token of *text*.

    For every token yields the surface form, then one line per analysis
    ("<analysis> <weight> <rest>"), then a blank line. Loads automata
    from the hard-coded system directory on every call.
    """
    om = Omorfi()
    om.load_from_dir('/usr/local/share/omorfi/', analyse=True)
    for token in om.tokenise(text):
        yield "%s\n" % token[0]
        for analyse_res in om.analyse(token):
            # renamed from `text` to avoid shadowing the parameter
            anal, weight = analyse_res[:2]
            if len(analyse_res) > 2:
                rest = " ".join([str(x) for x in analyse_res[2:]])
            else:
                rest = ''
            yield "%s %s %s\n" % (anal, weight, rest)
        yield "\n"
def main():
    """Invoke a simple CLI analyser.

    Tokenises --input (default stdin) and writes either space-separated
    tokens (moses) or CoNLL-U-style token lines (conllu) to --output
    (default stdout); timing statistics go to --statistics. Exits 0.
    """
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAPATH',
                   help="Path to directory of HFST format automata")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                   help="print output into OUTFILE", type=FileType('w'))
    a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile",
                   help="print statistics to STATFILE", type=FileType('w'))
    a.add_argument('-O', '--output-format', metavar="OUTFORMAT",
                   default="moses",
                   help="format output for OUTFORMAT",
                   choices=['moses', 'conllu'])
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.fsa:
        if options.verbose:
            print("reading language models in", options.fsa)
        omorfi.load_from_dir(options.fsa, analyse=True, accept=True)
    else:
        if options.verbose:
            print("reading language models in default dirs")
        omorfi.load_from_dir()
    if not options.infile:
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        options.statfile = stdout
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    lines = 0
    if options.output_format == 'conllu':
        print("# doc-name:", options.infile.name, file=options.outfile)
    for line in options.infile:
        lines += 1
        if options.verbose and lines % 10000 == 0:
            print(lines, "...")
        if not line or line.rstrip('\n') == '':
            continue
        surfs = omorfi.tokenise(line)
        tokens += len(surfs)
        if options.output_format == 'moses':
            print(' '.join([surf[0] for surf in surfs]),
                  file=options.outfile)
        else:
            print("# sentence-text:", line.rstrip("\n"),
                  file=options.outfile)
            # CoNLL-U style: index, form, nine underscore columns, then
            # token's second field as trailing misc
            for i, surf in enumerate(surfs, 1):
                print(i, surf[0], "_", "_", "_", "_", "_", "_", "_",
                      surf[1], sep="\t", file=options.outfile)
        if options.output_format == 'conllu':
            # blank line terminates a CoNLL-U sentence
            print(file=options.outfile)
    cpuend = process_time()
    realend = perf_counter()
    # guard: tokens / lines would raise ZeroDivisionError on empty input
    if lines > 0:
        print("Lines:", lines, "Tokens:", tokens, "Ratio:",
              tokens / lines, "tokens/line", file=options.statfile)
    else:
        print("Lines:", lines, "Tokens:", tokens, file=options.statfile)
    print("CPU time:", cpuend - cpustart,
          "Real time:", realend - realstart, file=options.statfile)
    print("Tokens per timeunit:", tokens / (realend - realstart),
          "Lines per timeunit:", lines / (realend - realstart),
          file=options.statfile)
    exit(0)
def main():
    """Segment text in some formats.

    Loads a segmenter and labelsegmenter (both required), segments each
    token of --input (default stdin) and prints moses-factors or plain
    segment output to --output (default stdout). Exits 0 on success,
    1 when segmenters could not be loaded, 2 when none was given.
    """
    a = ArgumentParser()
    a.add_argument('-s', '--segmenter', metavar='SFILE',
                   help="load segmenter from SFILE", required=True)
    a.add_argument('-S', '--labeller', metavar='LSFILE',
                   help="load labelsegmenter from LSFILE", required=True)
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE",
                   help="print segments into OUTFILE")
    a.add_argument('-O', '--output-format', metavar="OFORMAT",
                   help="format output suitable for OFORMAT",
                   required=True,
                   choices=["moses-factors", "segments"])
    a.add_argument('--no-split-words', action="store_false", default=True,
                   dest="split_words",
                   help="split on word boundaries")
    a.add_argument(
        '--no-split-new-words', action="store_false", default=True,
        dest="split_new_words",
        help="split on new word boundaries (prev. unattested compounds)")
    a.add_argument('--no-split-morphs', action="store_false", default=True,
                   dest="split_morphs",
                   help="split on morph boundaries")
    a.add_argument('--split-derivs', action="store_true", default=False,
                   help="split on derivation boundaries")
    a.add_argument('--split-nonwords', action="store_true", default=False,
                   help="split on other boundaries")
    a.add_argument('--segment-marker', default='→ ←', metavar='SEG',
                   help="mark segment boundaries with SEG")
    a.add_argument('--show-ambiguous', default=False, metavar='ASEP',
                   help="separate ambiguous segmentations with SEG")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.segmenter:
        if options.verbose:
            print("Reading segmenter", options.segmenter)
        omorfi.load_segmenter(options.segmenter)
    else:
        print("segmenter is needed for segmenting", file=stderr)
        exit(2)
    if options.labeller:
        if options.verbose:
            print("Reading labelsegmenter", options.labeller)
        omorfi.load_labelsegmenter(options.labeller)
    if not omorfi.can_segment:
        # NOTE(review): this parser has no -f option (models are loaded via
        # -s/-S); the hint below looks stale — confirm against the CLI docs.
        print("Could not load segmenter(s), re-compile them or use -f option")
        print()
        print("To compile segmenter, use --enable-segmenter, and/or",
              "--enable-labeled-segments")
        exit(1)
    if options.infile:
        infile = options.infile
    else:
        options.infile = stdin
        infile = stdin
    if options.output:
        outfile = open(options.output, 'w')
    else:
        options.output = "<stdout>"
        outfile = stdout
    if options.segment_marker is None:
        if options.verbose:
            print("Default segment marker is → ←")
        options.segment_marker = '→ ←'
    if options.verbose:
        print("reading from", options.infile.name)
    if options.verbose:
        # typo fix: message used to read "writign to"
        print("writing to", options.output)
    linen = 0
    for line in infile:
        line = line.strip()
        linen += 1
        if options.verbose and linen % 10000 == 0:
            print(linen, '...')
        if not line or line == '':
            print(file=outfile)
            continue
        tokens = omorfi.tokenise(line)
        for token in tokens:
            segments = omorfi.segment(token)
            labelsegments = omorfi.labelsegment(token)
            if options.output_format == 'moses-factors':
                print_moses_factor_segments(segments, labelsegments, token,
                                            outfile, options)
            elif options.output_format == 'segments':
                print_segments(segments, labelsegments, token, outfile,
                               options)
        print(file=outfile)
    exit(0)
def main():
    """Segment text in some formats.

    Loads segment/labelsegment automata from --fsa (or default dirs),
    segments each token of --input (default stdin) and prints
    moses-factors or plain segment output to --output (default stdout).
    Exits with status 0.
    """
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAPATH',
                   help="Path to directory of HFST format automata")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE",
                   help="print segments into OUTFILE")
    a.add_argument('-O', '--output-format', metavar="OFORMAT",
                   help="format output suitable for OFORMAT",
                   choices=["labels-tsv", "moses-factors", "segments"])
    a.add_argument('--no-split-words', action="store_false", default=True,
                   dest="split_words",
                   help="split on word boundaries")
    a.add_argument(
        '--no-split-new-words', action="store_false", default=True,
        dest="split_new_words",
        help="split on new word boundaries (prev. unattested compounds)")
    a.add_argument('--no-split-morphs', action="store_false", default=True,
                   dest="split_morphs",
                   help="split on morph boundaries")
    a.add_argument('--split-derivs', action="store_true", default=False,
                   help="split on derivation boundaries")
    a.add_argument('--split-nonwords', action="store_true", default=False,
                   help="split on other boundaries")
    a.add_argument('--segment-marker', default='→ ←', metavar='SEG',
                   help="mark segment boundaries with SEG")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.fsa:
        if options.verbose:
            print("Reading automata dir", options.fsa)
        omorfi.load_from_dir(options.fsa, segment=True, labelsegment=True)
    else:
        if options.verbose:
            print("Searching for automata everywhere...")
        omorfi.load_from_dir(labelsegment=True, segment=True)
    if options.infile:
        infile = options.infile
    else:
        options.infile = stdin
        infile = stdin
    if options.output:
        outfile = open(options.output, 'w')
    else:
        options.output = "<stdout>"
        outfile = stdout
    if options.segment_marker is None:
        if options.verbose:
            print("Default segment marker is → ←")
        options.segment_marker = '→ ←'
    if options.verbose:
        print("reading from", options.infile.name)
    if options.verbose:
        # typo fix: message used to read "writign to"
        print("writing to", options.output)
    linen = 0
    for line in infile:
        line = line.strip()
        linen += 1
        if options.verbose and linen % 10000 == 0:
            print(linen, '...')
        if not line or line == '':
            print(file=outfile)
            continue
        tokens = omorfi.tokenise(line)
        for token in tokens:
            # tokens are tuples here; segment the surface form token[0]
            segments = omorfi.segment(token[0])
            labelsegments = omorfi.labelsegment(token[0])
            if options.output_format == 'moses-factors':
                print_moses_factor_segments(
                    segments, labelsegments, token[0], outfile, options)
            elif options.output_format == 'segments':
                print_segments(segments, labelsegments, token[0], outfile,
                               options)
        print(file=outfile)
    exit(0)
def main():
    """Invoke a simple CLI analyser.

    Tokenises --input (default stdin) with the required --analyser model
    and writes moses, json, conllu or ftb3 formatted tokens to --output
    (default stdout); statistics to --statistics. Exits 0 on success,
    1 when no analyser was given.
    """
    a = ArgumentParser()
    a.add_argument('-a', '--analyser', metavar='AFILE',
                   help="load tokeniser model from (analyser) AFILE",
                   required=True)
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE", dest="outfile",
                   help="print output into OUTFILE", type=FileType('w'))
    a.add_argument('-x', '--statistics', metavar="STATFILE", dest="statfile",
                   help="print statistics to STATFILE", type=FileType('w'))
    a.add_argument('-O', '--output-format', metavar="OUTFORMAT",
                   default="moses",
                   help="format output for OUTFORMAT",
                   choices=['moses', 'conllu', 'json', 'ftb3'])
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.analyser:
        if options.verbose:
            print("reading language model", options.analyser)
        omorfi.load_analyser(options.analyser)
    else:
        print("analyser is needed for tokenisation", file=stderr)
        exit(1)
    if not options.infile:
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        options.statfile = stdout
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    lines = 0
    if options.output_format == 'conllu':
        print("# new doc id=", options.infile.name, file=options.outfile)
    for line in options.infile:
        lines += 1
        if options.verbose and lines % 10000 == 0:
            print(lines, "...")
        if not line or line.rstrip('\n') == '':
            continue
        surfs = omorfi.tokenise(line)
        tokens += len(surfs)
        if options.output_format == 'moses':
            print(' '.join([surf['surf'] for surf in surfs]),
                  file=options.outfile)
        elif options.output_format == 'json':
            # BUG FIX: the json module has no encode(); use dumps(), and
            # honour --output like every other format branch does
            print(json.dumps(surfs), file=options.outfile)
        elif options.output_format == 'conllu':
            print("# sent_id =", lines, file=options.outfile)
            print("# text =", line.rstrip("\n"), file=options.outfile)
            for i, surf in enumerate(surfs, 1):
                print(i, surf['surf'], "_", "_", "_", "_", "_", "_", "_",
                      format_misc_ud(surf), sep="\t", file=options.outfile)
        elif options.output_format == 'ftb3':
            print("<s><loc file=\"", options.infile.name, "\" line=\"",
                  lines, "\" />", file=options.outfile, sep="")
            for i, surf in enumerate(surfs, 1):
                print(i, surf['surf'], "_", "_", "_", "_", "_", "_", "_",
                      "_", sep="\t", file=options.outfile)
            print("</s>", file=options.outfile)
        if options.output_format == 'conllu':
            # blank line terminates a CoNLL-U sentence
            print(file=options.outfile)
    cpuend = process_time()
    realend = perf_counter()
    # guard: tokens / lines would raise ZeroDivisionError on empty input
    if lines > 0:
        print("Lines:", lines, "Tokens:", tokens, "Ratio:",
              tokens / lines, "tokens/line", file=options.statfile)
    else:
        print("Lines:", lines, "Tokens:", tokens, file=options.statfile)
    print("CPU time:", cpuend - cpustart,
          "Real time:", realend - realstart, file=options.statfile)
    print("Tokens per timeunit:", tokens / (realend - realstart),
          "Lines per timeunit:", lines / (realend - realstart),
          file=options.statfile)
    exit(0)
def main():
    """Segment text in some formats.

    Loads segment/labelsegment automata from --fsa (or default dirs),
    segments each token of --input (default stdin) and prints
    moses-factors or plain segment output to --output (default stdout).
    Exits with status 0.
    """
    a = ArgumentParser()
    a.add_argument('-f', '--fsa', metavar='FSAPATH',
                   help="Path to directory of HFST format automata")
    a.add_argument('-i', '--input', metavar="INFILE", type=open,
                   dest="infile", help="source of analysis data")
    a.add_argument('-v', '--verbose', action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o', '--output', metavar="OUTFILE",
                   help="print segments into OUTFILE")
    a.add_argument('-O', '--output-format', metavar="OFORMAT",
                   help="format output suitable for OFORMAT",
                   choices=["labels-tsv", "moses-factors", "segments"])
    a.add_argument('--split-words', action="store_true", default=True,
                   help="split on word boundaries")
    a.add_argument(
        '--split-new-words', action="store_true", default=True,
        help="split on new word boundaries (prev. unattested compounds)")
    a.add_argument('--split-morphs', action="store_true", default=True,
                   help="split on morph boundaries")
    a.add_argument('--split-derivs', action="store_true", default=False,
                   help="split on derivation boundaries")
    a.add_argument('--split-nonwords', action="store_true", default=True,
                   help="split on other boundaries")
    a.add_argument('--segment-marker', default=' ', metavar='SEG',
                   help="mark segment boundaries with SEG")
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.fsa:
        if options.verbose:
            print("Reading automata dir", options.fsa)
        omorfi.load_from_dir(options.fsa, segment=True, labelsegment=True)
    else:
        if options.verbose:
            print("Searching for automata everywhere...")
        omorfi.load_from_dir(labelsegment=True, segment=True)
    if options.infile:
        infile = options.infile
    else:
        # BUG FIX: record the fallback on options too; otherwise the
        # verbose print below crashes on options.infile.name (None)
        options.infile = stdin
        infile = stdin
    if options.output:
        outfile = open(options.output, 'w')
    else:
        # BUG FIX: give the verbose print a readable name instead of None
        options.output = "<stdout>"
        outfile = stdout
    if options.verbose:
        print("reading from", options.infile.name)
    if options.verbose:
        # typo fix: message used to read "writign to"
        print("writing to", options.output)
    linen = 0
    for line in infile:
        line = line.strip()
        linen += 1
        if options.verbose and linen % 10000 == 0:
            print(linen, '...')
        if not line or line == '':
            continue
        surfs = omorfi.tokenise(line)
        for surf in surfs:
            segments = omorfi.segment(surf)
            labelsegments = omorfi.labelsegment(surf)
            if options.output_format == 'moses-factors':
                # NOTE(review): sibling CLIs pass *options* as a fifth
                # argument here — confirm this helper's signature
                print_moses_factor_segments(segments, labelsegments, surf,
                                            outfile)
            elif options.output_format == 'segments':
                print_segments(segments, labelsegments, surf, outfile,
                               options)
        print(file=outfile)
    exit(0)
def stream(text):
    """Yield one line of space-separated lemmas per token of *text*.

    Loads automata from the hard-coded system directory on every call.
    """
    om = Omorfi()
    om.load_from_dir('/usr/local/share/omorfi/', lemmatise=True)
    for token in om.tokenise(text):
        # str is callable directly; the lambda wrapper was redundant
        yield " ".join(map(str, om.lemmatise(token[0])))