예제 #1
0
def main():
    """Invoke a simple CLI analyser."""
    a = ArgumentParser()
    a.add_argument('-a',
                   '--analyser',
                   metavar='AFILE',
                   help="load analyser model from AFILE")
    a.add_argument('-i',
                   '--input',
                   metavar="INFILE",
                   type=open,
                   dest="infile",
                   help="source of analysis data")
    a.add_argument('-v',
                   '--verbose',
                   action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o',
                   '--output',
                   metavar="OUTFILE",
                   dest="outfile",
                   help="print output into OUTFILE",
                   type=FileType('w'))
    a.add_argument('-F',
                   '--format',
                   metavar="INFORMAT",
                   default='text',
                   help="read input using INFORMAT tokenisation",
                   choices=['text', 'vislcg', 'conllu'])
    a.add_argument('-x',
                   '--statistics',
                   metavar="STATFILE",
                   dest="statfile",
                   help="print statistics to STATFILE",
                   type=FileType('w'))
    options = a.parse_args()
    omorfi = Omorfi(options.verbose)
    if options.analyser:
        if options.verbose:
            print("reading analyser model", options.analyser)
        omorfi.load_analyser(options.analyser)
    else:
        print("analyser is required to vislcg", file=stderr)
        exit(4)
    if not options.infile:
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        if options.outfile == stdout:
            options.statfile = stdout
        else:
            options.statfile = stderr
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    unknowns = 0
    last = None
    for line in options.infile:
        surfs = []
        if options.format == 'vislcg':
            surfs = get_line_tokens_vislcg(line, last)
        elif options.format == 'text':
            surfs = get_line_tokens(line, omorfi)
        elif options.format == 'conllu':
            surfs = get_line_tokens_conllu(line, last)
        else:
            print("input format missing implementation",
                  options.format,
                  file=stderr)
            exit(2)
        for surf in surfs:
            if 'conllu_form' in surf:
                # skip conllu special forms in input for now:
                # (ellipsis and MWE magics)
                continue
            elif 'surf' in surf:
                tokens += 1
                anals = omorfi.analyse(surf)
                if len(anals) == 0 or (len(anals) == 1
                                       and 'UNKNOWN' in anals[0]['anal']):
                    unknowns += 1
                    anals = omorfi.guess(surf)
                print_analyses_vislcg3(surf, anals, options.outfile)
            elif 'comment' in surf:
                if surf['comment'].startswith(';') or \
                       surf['comment'].startswith('\t'):
                    continue
                else:
                    print(surf['comment'], file=options.outfile)
            elif 'error' in surf:
                print(surf['error'], file=stderr)
                exit(2)
            last = surf
    cpuend = process_time()
    realend = perf_counter()
    print("# Tokens:",
          tokens,
          "\n# Unknown:",
          unknowns,
          unknowns / tokens * 100,
          "%",
          file=options.statfile)
    print("# CPU time:",
          cpuend - cpustart,
          "\n# Real time:",
          realend - realstart,
          file=options.statfile)
    print("# Tokens per timeunit:",
          tokens / (realend - realstart),
          file=options.statfile)
    exit(0)
예제 #2
0
def main():
    """Invoke a simple CLI analyser."""
    a = ArgumentParser()
    a.add_argument('-f',
                   '--fsa',
                   metavar='FSAPATH',
                   help="Path to directory of HFST format automata")
    a.add_argument('-i',
                   '--input',
                   metavar="INFILE",
                   type=open,
                   dest="infile",
                   help="source of analysis data")
    a.add_argument('-v',
                   '--verbose',
                   action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o',
                   '--output',
                   metavar="OUTFILE",
                   dest="outfile",
                   help="print output into OUTFILE",
                   type=FileType('w'))
    a.add_argument('-x',
                   '--statistics',
                   metavar="STATFILE",
                   dest="statfile",
                   help="print statistics to STATFILE",
                   type=FileType('w'))
    a.add_argument('-O',
                   '--oracle',
                   action='store_true',
                   help="match to values in input when parsing if possible")
    a.add_argument('-u',
                   '--udpipe',
                   metavar="UDPIPE",
                   help='use UDPIPE for additional guesses (experi-mental)')
    a.add_argument('--hacks',
                   metavar='HACKS',
                   help="mangle anaelyses to match HACKS version of UD",
                   choices=['ftb'])
    a.add_argument('--debug',
                   action='store_true',
                   help="print lots of debug info while processing")
    options = a.parse_args()
    if options.verbose:
        print("Printing verbosely")
    omorfi = Omorfi(options.verbose)
    if options.fsa:
        if options.verbose:
            print("reading language models in", options.fsa)
        omorfi.load_from_dir(options.fsa, analyse=True, guesser=True)
    else:
        if options.verbose:
            print("reading language models in default dirs")
        omorfi.load_from_dir()
    if options.udpipe:
        if options.verbose:
            print("Loading udpipe", options.udpipe)
        omorfi.load_udpipe(options.udpipe)
    if not options.infile:
        print("reading from <stdin>")
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        options.statfile = stdout
    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    unknowns = 0
    sentences = 0
    recognised_comments = [
        'sent_id =', 'text =', 'doc-name:', 'sentence-text:'
    ]
    for line in options.infile:
        fields = line.strip().split('\t')
        if len(fields) == 10:
            # conllu is 10 field format
            tokens += 1
            try:
                index = int(fields[0])
            except ValueError:
                if '-' in fields[0]:
                    # MWE
                    continue
                elif '.' in fields[0]:
                    # a ghost
                    continue
                else:
                    print("Cannot figure out token index",
                          fields[0],
                          file=stderr)
                    exit(1)
            surf = fields[1]
            anals = omorfi.analyse(surf)
            if not anals or len(anals) == 0 or (len(anals) == 1
                                                and 'UNKNOWN' in anals[0][0]):
                unknowns += 1
                anals = omorfi.guess(surf)
            if anals and len(anals) > 0:
                if options.debug:
                    debug_analyses_conllu(fields, index, surf, anals,
                                          options.outfile, options.hacks)
                elif options.oracle:
                    try_analyses_conllu(fields, index, surf, anals,
                                        options.outfile, options.hacks)
                else:
                    print_analyses_conllu(index, surf, anals[0],
                                          options.outfile, options.hacks)
        elif line.startswith('#'):
            print(line.strip(), file=options.outfile)
            recognised = False
            for rec in recognised_comments:
                if line.startswith('# ' + rec):
                    recognised = True
            if not recognised and options.verbose:
                print("Warning! Unrecognised comment line:", line, sep='\n')
        elif not line or line.strip() == '':
            # retain exactly 1 empty line between sents
            print(file=options.outfile)
            sentences += 1
        else:
            print("Error in conllu format:", line, sep='\n', file=stderr)
            exit(1)
    cpuend = process_time()
    realend = perf_counter()
    print("Tokens:", tokens, "Sentences:", sentences, file=options.statfile)
    print("Unknowns / OOV:",
          unknowns,
          "=",
          unknowns / tokens * 100 if tokens != 0 else 0,
          "%",
          file=options.statfile)
    print("CPU time:",
          cpuend - cpustart,
          "Real time:",
          realend - realstart,
          file=options.statfile)
    print("Tokens per timeunit:",
          tokens / (realend - realstart),
          file=options.statfile)
    exit(0)
예제 #3
0
def main():
    """Invoke a simple CLI analyser."""
    a = ArgumentParser()
    a.add_argument('-a',
                   '--analyser',
                   metavar='AFILE',
                   help="read analyser model from AFILE",
                   required=True)
    a.add_argument('-i',
                   '--input',
                   metavar="INFILE",
                   type=open,
                   dest="infile",
                   help="source of analysis data")
    a.add_argument('-v',
                   '--verbose',
                   action='store_true',
                   help="print verbosely while processing")
    a.add_argument('-o',
                   '--output',
                   metavar="OUTFILE",
                   dest="outfile",
                   help="print output into OUTFILE",
                   type=FileType('w'))
    a.add_argument('-x',
                   '--statistics',
                   metavar="STATFILE",
                   dest="statfile",
                   help="print statistics to STATFILE",
                   type=FileType('w'))
    a.add_argument('-O',
                   '--oracle',
                   action='store_true',
                   help="match to values in input when parsing if possible")
    a.add_argument('-X',
                   '--frequencies',
                   metavar="FREQDIR",
                   help="read frequencies from FREQDIR/*.freqs")
    a.add_argument('--debug',
                   action='store_true',
                   help="print lots of debug info while processing")
    options = a.parse_args()
    if options.verbose:
        print("Printing verbosely")
    omorfi = Omorfi(options.verbose)
    if options.analyser:
        if options.verbose:
            print("reading analyser model", options.analyser)
        omorfi.load_analyser(options.analyser)
    else:
        print("analyser is needed to ftb3", file=stderr)
        exit(4)
    if not options.infile:
        print("reading from <stdin>")
        options.infile = stdin
    if options.verbose:
        print("analysing", options.infile.name)
    if not options.outfile:
        options.outfile = stdout
    if options.verbose:
        print("writing to", options.outfile.name)
    if not options.statfile:
        options.statfile = stdout
    lexprobs = None
    tagprobs = None

    if options.frequencies:
        with open(options.frequencies + '/lexemes.freqs') as lexfile:
            omorfi.load_lexical_frequencies(lexfile)
        with open(options.frequencies + '/omors.freqs') as omorfile:
            omorfi.load_omortag_frequencies(omorfile)

    # statistics
    realstart = perf_counter()
    cpustart = process_time()
    tokens = 0
    unknowns = 0
    sentences = 0
    for line in options.infile:
        fields = line.strip().split('\t')
        if len(fields) == 10:
            # ftb is 10 field format
            tokens += 1
            try:
                index = int(fields[0])
            except ValueError:
                print("Cannot figure out token index", fields[0], file=stderr)
                exit(1)
            surf = fields[1]
            anals = omorfi.analyse(surf)
            if not anals or len(anals) == 0 or (len(anals) == 1
                                                and 'OOV' in anals[0]):
                unknowns += 1
                anals = omorfi.guess(surf)
            if anals and len(anals) > 0:
                if options.oracle:
                    try_analyses_ftb(fields, index, surf, anals,
                                     options.outfile)
                else:
                    print_analyses_ftb(index, surf, anals[0], options.outfile)
            else:
                print("Failed:", fields)
                exit(1)
        elif line.startswith('<') and line.rstrip().endswith('>'):
            print(line.strip(), file=options.outfile)
        elif not line or line.strip() == '':
            # retain exactly 1 empty line between sents
            print(file=options.outfile)
            sentences += 1
        else:
            print("Error in ftb3 format: '", line, "'", file=stderr)
            exit(1)
    cpuend = process_time()
    realend = perf_counter()
    print("Tokens:", tokens, "Sentences:", sentences, file=options.statfile)
    print("Unknowns / OOV:",
          unknowns,
          "=",
          unknowns / tokens * 100 if tokens != 0 else 0,
          "%",
          file=options.statfile)
    print("CPU time:",
          cpuend - cpustart,
          "Real time:",
          realend - realstart,
          file=options.statfile)
    print("Tokens per timeunit:",
          tokens / (realend - realstart),
          file=options.statfile)
    exit(0)