예제 #1
0
def main():
    """Command-line entry point for the Daba morphological parser.

    Builds the argument parser, loads dictionary/grammar resources, and
    runs parse_file over a single input or a list of input files.
    """
    load_plugins()

    cli = argparse.ArgumentParser(description='Daba suite. Command line morphological parser.')
    cli.add_argument('-i', '--infile', help='Input file (.txt or .html)', default="sys.stdin")
    cli.add_argument('-o', '--outfile', help='Output file', default="sys.stdout")
    cli.add_argument('-s', '--script', action='append', choices=OrthographyConverter.get_plugins().keys(), default=None, help='Perform orthographic conversion operations (defined in plugins). Conversions will be applied in the order they appear on command line.')
    cli.add_argument("-d", "--dictionary", action="append", help="Toolbox dictionary file (may be added multiple times)")
    cli.add_argument("-g", "--grammar", help="Grammar specification file")
    cli.add_argument("-n", "--noparse", action='store_true', help="Do not parse, only process resources")
    cli.add_argument("-l", "--list", help="Read input filenames list from file")
    cli.add_argument("-t", "--detone", action='store_true', help="Ignore tones in dictionary lookups")
    cli.add_argument("-v", "--verbose", action='store_true', help="Print info messages on loaded dictionaries")
    opts = cli.parse_args()

    dictloader = DictLoader(verbose=opts.verbose)
    grammarloader = GrammarLoader()
    for dicfile in (opts.dictionary or []):
        dictloader.addfile(dicfile)
    if opts.grammar:
        grammarloader.load(opts.grammar)
    if not opts.noparse:
        processor = Processor(dictloader, grammarloader, converters=opts.script, detone=opts.detone)
        if opts.list:
            # one input filename per line; outputs go next to the inputs
            with open(opts.list) as filelist:
                for line in filelist:
                    # Python 2: file lines are bytes, decode before path ops
                    infile = os.path.normpath(line.decode('utf-8').strip())
                    if os.path.exists(infile):
                        outfile = os.path.splitext(infile)[0] + '.pars.html'
                        parse_file(infile, outfile, processor, opts)
        else:
            parse_file(opts.infile, opts.outfile, processor, opts)
    exit(0)
예제 #2
0
 def __init__(self, parent, *args, **kwargs):
     """Build a panel listing available orthographic converter plugins.

     Loads the mparser plugins, shows every registered converter name in
     a check-list box with the default selection pre-checked, and binds
     check/uncheck events to self.OnSelection.
     """
     wx.Panel.__init__(self, parent, *args, **kwargs)
     # FIXME: make default plugins configurable from config file
     self.selection = ('apostrophe',)  # converter names checked by default
     mparser.load_plugins()
     # names of all registered OrthographyConverter plugins
     # (Python 2: keys() returns a list, usable as CheckListBox choices)
     self.converters = OrthographyConverter.get_plugins().keys()
     converterbox = wx.StaticBox(self, -1, "Available Orthographic Converters")
     self.csizer = wx.StaticBoxSizer(converterbox, wx.VERTICAL)
     self.converterlist = wx.CheckListBox(self, wx.ID_ANY, choices=self.converters)
     self.converterlist.SetCheckedStrings(self.selection)
     self.Bind(wx.EVT_CHECKLISTBOX, self.OnSelection, self.converterlist)
     self.csizer.Add(self.converterlist, 0, wx.TOP|wx.LEFT, 10)
     self.SetSizer(self.csizer)
     self.Layout()
예제 #3
0
파일: gparser.py 프로젝트: eldams/daba
 def __init__(self, parent, *args, **kwargs):
     """Panel presenting the registered orthographic converters as a
     check-list box; the default selection is pre-checked and changes
     are routed to self.OnSelection."""
     wx.Panel.__init__(self, parent, *args, **kwargs)
     # FIXME: make default plugins configurable from config file
     self.selection = ('apostrophe',)
     mparser.load_plugins()
     self.converters = OrthographyConverter.get_plugins().keys()
     box = wx.StaticBox(self, -1, "Available Orthographic Converters")
     self.csizer = wx.StaticBoxSizer(box, wx.VERTICAL)
     self.converterlist = wx.CheckListBox(
         self, wx.ID_ANY, choices=self.converters)
     self.converterlist.SetCheckedStrings(self.selection)
     self.Bind(wx.EVT_CHECKLISTBOX, self.OnSelection, self.converterlist)
     self.csizer.Add(self.converterlist, 0, wx.TOP | wx.LEFT, 10)
     self.SetSizer(self.csizer)
     self.Layout()
예제 #4
0
    def parse(self, txt):
        """Tokenize and annotate every paragraph in txt.

        txt is an iterable of paragraph strings. Returns (and stores in
        self.parsed) a list with one entry per paragraph; each entry is a
        list of (sentence_text, annotations) pairs, where annotations is
        a list of formats.GlossToken objects built for each token.
        """
        self.parsed = []
        for para in txt:
            par = []
            tkz = Tokenizer()
            for sent in tkz.split_sentences(tkz.tokenize(para)):
                # sentence text is rebuilt from token values; the shared
                # annotation list is filled in place below
                st = (''.join(t.value for t in sent), [])
                par.append(st)
                annot = st[1]
                prevtoken = False  # True once a Word token has been seen in this sentence
                for token in sent:
                    if token.type in ['Comment', 'Tag']:
                        # passed through unchanged under their own type tag
                        annot.append(formats.GlossToken((token.type, token.value)))
                    elif token.type in ['Punct', 'SentPunct', 'Nonword']:
                        annot.append(formats.GlossToken(('c', token.value)))
                    elif token.type in ['Cardinal']:
                        # numbers get a fixed CARDINAL gloss attributed to the tokenizer
                        gloss = Gloss(token.value, ('num',), 'CARDINAL', ())
                        annot.append(formats.GlossToken(('w', (token.value, 'tokenizer', [gloss]))))
                    elif token.type in ['Word']:
                        if self.converters:
                            # run each orthography converter over every variant
                            # produced so far (a variant may fan out into several)
                            wlist = [token.value]
                            for plugin in self.converters:
                                converted = []
                                for w in wlist:
                                    for result in OrthographyConverter.get_plugins()[plugin].convert(w):
                                        converted.append(result)
                                wlist = converted
                            # lemmatize() returns a (stage, glosses) pair;
                            # keep glosses from non-negative stages if any,
                            # otherwise fall back to all of them
                            converts = [self.parser.lemmatize(w.lower()) for w in wlist]
                            successfull = [x[1] for x in filter(lambda s:s[0]>=0, converts)] or [c[1] for c in converts]
                            stage = max([c[0] for c in converts])
                            glosslist = []
                            for gl in successfull:
                                glosslist.extend(gl)
                        else:
                            stage, glosslist = self.parser.lemmatize(token.value.lower())

                        # suggest proper name variant for capitalized words (not in sentence-initial position)
                        if token.value.istitle() and prevtoken and 'n.prop' not in set([]).union(*[g.ps for g in glosslist]):
                            propn = Gloss(token.value, ('n.prop',), token.value, ())
                            glosslist.insert(0, propn)

                        # NOTE: Python 2 code (unicode builtin)
                        annot.append(formats.GlossToken(('w', (token.value, unicode(stage), glosslist))))
                        prevtoken = True

            self.parsed.append(par)
        return self.parsed
예제 #5
0
def main():
    """Command-line entry point for the Daba morphological parser.

    Builds the argument parser, loads dictionary and grammar resources,
    and (unless --noparse) runs parse_file over a single input or over
    every existing file named in the --list file.
    """
    load_plugins()

    aparser = argparse.ArgumentParser(
        description='Daba suite. Command line morphological parser.')
    aparser.add_argument('-i',
                         '--infile',
                         help='Input file (.txt or .html)',
                         default="sys.stdin")
    aparser.add_argument('-o',
                         '--outfile',
                         help='Output file',
                         default="sys.stdout")
    aparser.add_argument(
        '-s',
        '--script',
        action='append',
        choices=OrthographyConverter.get_plugins().keys(),
        default=None,
        help=
        'Perform orthographic conversion operations (defined in plugins). Conversions will be applied in the order they appear on command line.'
    )
    aparser.add_argument(
        "-d",
        "--dictionary",
        action="append",
        help="Toolbox dictionary file (may be added multiple times)")
    aparser.add_argument("-g", "--grammar", help="Grammar specification file")
    aparser.add_argument("-n",
                         "--noparse",
                         action='store_true',
                         help="Do not parse, only process resources")
    aparser.add_argument("-l",
                         "--list",
                         help="Read input filenames list from file")
    aparser.add_argument("-t",
                         "--detone",
                         action='store_true',
                         help="Ignore tones in dictionary lookups")
    aparser.add_argument("-v",
                         "--verbose",
                         action='store_true',
                         help="Print info messages on loaded dictionaries")
    args = aparser.parse_args()

    dl = DictLoader(verbose=args.verbose)
    gr = GrammarLoader()
    if args.dictionary:
        for dicfile in args.dictionary:
            dl.addfile(dicfile)
    if args.grammar:
        gr.load(args.grammar)
    if not args.noparse:
        pp = Processor(dl, gr, converters=args.script, detone=args.detone)
        if args.list:
            # one input filename per line; output written next to input
            with open(args.list) as filelist:
                for line in filelist:
                    # Python 2: file lines are bytes, decode before path ops
                    infile = os.path.normpath(line.decode('utf-8').strip())
                    if os.path.exists(infile):
                        outfile = os.path.splitext(infile)[0] + '.pars.html'
                        parse_file(infile, outfile, pp, args)
        else:
            parse_file(args.infile, args.outfile, pp, args)
    exit(0)
예제 #6
0
    def parse(self, txt):
        """Tokenize and annotate every paragraph in txt.

        Returns (and stores in self.parsed) one list per paragraph of
        (sentence_text, gloss_token_list) pairs.
        """
        self.parsed = []
        for paragraph in txt:
            sentences = []
            tz = Tokenizer()
            for sentence in tz.split_sentences(tz.tokenize(paragraph)):
                pair = (''.join(tok.value for tok in sentence), [])
                sentences.append(pair)
                annotations = pair[1]
                seen_word = False
                for tok in sentence:
                    if tok.type in ['Comment', 'Tag']:
                        annotations.append(formats.GlossToken((tok.type, tok.value)))
                    elif tok.type in ['Punct', 'SentPunct', 'Nonword']:
                        annotations.append(formats.GlossToken(('c', tok.value)))
                    elif tok.type in ['Cardinal']:
                        gloss = Gloss(tok.value, ('num',), 'CARDINAL', ())
                        annotations.append(formats.GlossToken(('w', (tok.value, 'tokenizer', [gloss]))))
                    elif tok.type in ['Word']:
                        if self.converters:
                            # each converter may fan one variant out into several
                            forms = [tok.value]
                            for plugin in self.converters:
                                step = []
                                for form in forms:
                                    step.extend(OrthographyConverter.get_plugins()[plugin].convert(form))
                                forms = step
                            # keep glosses from non-negative stages when any
                            # exist, otherwise fall back to all of them
                            lemmatized = [self.parser.lemmatize(form.lower()) for form in forms]
                            winners = [r[1] for r in lemmatized if r[0] >= 0] or [r[1] for r in lemmatized]
                            stage = max(r[0] for r in lemmatized)
                            glosslist = []
                            for chunk in winners:
                                glosslist.extend(chunk)
                        else:
                            stage, glosslist = self.parser.lemmatize(tok.value.lower())

                        # suggest proper name variant for capitalized words
                        # (not in sentence-initial position)
                        if tok.value.istitle() and seen_word and 'n.prop' not in set().union(*[g.ps for g in glosslist]):
                            glosslist.insert(0, Gloss(tok.value, ('n.prop',), tok.value, ()))

                        annotations.append(formats.GlossToken(('w', (tok.value, unicode(stage), glosslist))))
                        seen_word = True

            self.parsed.append(sentences)
        return self.parsed
예제 #7
0
파일: nko2latin.py 프로젝트: Mompolice/daba
#!/usr/bin/python
# -*- coding: utf-8 -*-

import os
import sys
from plugins import OrthographyConverter


def load_plugins():
    """Import every module from the ``plugins`` directory next to the
    script's parent directory, so converter plugins register themselves.

    The directory is located relative to the running script
    (sys.argv[0]); dunder files such as __init__.py are skipped.
    """
    plugindir = os.path.join(os.path.abspath(os.path.dirname(sys.argv[0])), "..", "plugins")
    plugins = [x[:-3] for x in os.listdir(plugindir) if x.endswith(".py") and not x.startswith("__")]
    for plugin in plugins:
        # imported purely for the registration side effect; the returned
        # module object is not needed (fixes an unused local)
        __import__(".".join(["plugins", plugin]))


load_plugins()
# the N'Ko converter registered by the plugin import above
converter = OrthographyConverter.get_plugins()["nko"]

# Transliterate the file named by the first CLI argument word by word,
# printing one converted line per input line.
# (Python 2: decode input bytes, re-encode output as UTF-8.)
for line in open(sys.argv[1]):
    ws = line.decode("utf-8").strip().split()
    # convert() yields one or more variants; the first one is used
    print " ".join([converter.convert(w)[0] for w in ws]).encode("utf-8")
예제 #8
0
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

import os
import sys
from plugins import OrthographyConverter

def load_plugins():
    """Import every module from the ``plugins`` directory next to the
    script's parent directory, so converter plugins register themselves.

    The directory is located relative to the running script
    (sys.argv[0]); dunder files such as __init__.py are skipped.
    """
    plugindir = os.path.join(os.path.abspath(os.path.dirname(sys.argv[0])), '..', 'plugins')
    plugins = [x[:-3] for x in os.listdir(plugindir) if x.endswith('.py') and not x.startswith('__')]
    for plugin in plugins:
        # imported purely for the registration side effect; the returned
        # module object is not needed (fixes an unused local)
        __import__('.'.join(['plugins', plugin]))


# Register converter plugins, then transliterate the file named by the
# first CLI argument word by word, one converted line per input line.
load_plugins()
converter = OrthographyConverter.get_plugins()["nko"]

# use a context manager so the input file is closed deterministically
# (the original relied on implicit finalization)
with open(sys.argv[1]) as infile:
    for line in infile:
        words = line.strip().split()
        # convert() yields one or more variants; the first one is used
        print(' '.join(converter.convert(w)[0] for w in words))