def main():
    """Command-line entry point: build the CLI, load resources, run the parser."""
    load_plugins()
    argparser = argparse.ArgumentParser(description='Daba suite. Command line morphological parser.')
    argparser.add_argument('-i', '--infile',
                           help='Input file (.txt or .html)',
                           default="sys.stdin")
    argparser.add_argument('-o', '--outfile',
                           help='Output file',
                           default="sys.stdout")
    argparser.add_argument('-s', '--script',
                           action='append',
                           choices=OrthographyConverter.get_plugins().keys(),
                           default=None,
                           help='Perform orthographic conversion operations (defined in plugins). Conversions will be applied in the order they appear on command line.')
    argparser.add_argument("-d", "--dictionary",
                           action="append",
                           help="Toolbox dictionary file (may be added multiple times)")
    argparser.add_argument("-g", "--grammar",
                           help="Grammar specification file")
    argparser.add_argument("-n", "--noparse",
                           action='store_true',
                           help="Do not parse, only process resources")
    argparser.add_argument("-l", "--list",
                           help="Read input filenames list from file")
    argparser.add_argument("-t", "--detone",
                           action='store_true',
                           help="Ignore tones in dictionary lookups")
    argparser.add_argument("-v", "--verbose",
                           action='store_true',
                           help="Print info messages on loaded dictionaries")
    args = argparser.parse_args()

    dict_loader = DictLoader(verbose=args.verbose)
    grammar_loader = GrammarLoader()
    # Register every dictionary given on the command line.
    if args.dictionary:
        for dictionary_file in args.dictionary:
            dict_loader.addfile(dictionary_file)
    if args.grammar:
        grammar_loader.load(args.grammar)

    if not args.noparse:
        processor = Processor(dict_loader, grammar_loader,
                              converters=args.script, detone=args.detone)
        if args.list:
            # Batch mode: the list file names one input file per line
            # (Python 2: raw bytes are decoded as UTF-8 here).
            with open(args.list) as name_list:
                for raw_line in name_list:
                    input_path = os.path.normpath(raw_line.decode('utf-8').strip())
                    if os.path.exists(input_path):
                        output_path = os.path.splitext(input_path)[0] + '.pars.html'
                        parse_file(input_path, output_path, processor, args)
        else:
            parse_file(args.infile, args.outfile, processor, args)
    exit(0)
def __init__(self, parent, *args, **kwargs):
    """Panel listing all orthographic converter plugins as checkable items."""
    wx.Panel.__init__(self, parent, *args, **kwargs)
    # FIXME: make default plugins configurable from config file
    self.selection = ('apostrophe',)
    mparser.load_plugins()
    self.converters = OrthographyConverter.get_plugins().keys()
    # Checklist of converters wrapped in a labelled static box.
    box = wx.StaticBox(self, -1, "Available Orthographic Converters")
    self.csizer = wx.StaticBoxSizer(box, wx.VERTICAL)
    self.converterlist = wx.CheckListBox(self, wx.ID_ANY,
                                         choices=self.converters)
    self.converterlist.SetCheckedStrings(self.selection)
    self.Bind(wx.EVT_CHECKLISTBOX, self.OnSelection, self.converterlist)
    self.csizer.Add(self.converterlist, 0, wx.TOP | wx.LEFT, 10)
    self.SetSizer(self.csizer)
    self.Layout()
def __init__(self, parent, *args, **kwargs):
    """Create the converter-chooser panel; 'apostrophe' is checked by default."""
    wx.Panel.__init__(self, parent, *args, **kwargs)
    # FIXME: make default plugins configurable from config file
    self.selection = ('apostrophe',)
    # Plugins must be loaded before their names can be listed.
    mparser.load_plugins()
    self.converters = OrthographyConverter.get_plugins().keys()
    group = wx.StaticBox(self, -1, "Available Orthographic Converters")
    checklist = wx.CheckListBox(self, wx.ID_ANY, choices=self.converters)
    checklist.SetCheckedStrings(self.selection)
    self.Bind(wx.EVT_CHECKLISTBOX, self.OnSelection, checklist)
    self.converterlist = checklist
    self.csizer = wx.StaticBoxSizer(group, wx.VERTICAL)
    self.csizer.Add(self.converterlist, 0, wx.TOP | wx.LEFT, 10)
    self.SetSizer(self.csizer)
    self.Layout()
def parse(self, txt):
    """Tokenize and annotate every paragraph in *txt*.

    Returns a list of paragraphs; each paragraph is a list of
    (sentence text, list of GlossToken) pairs. The result is also
    stored on self.parsed.
    """
    self.parsed = []
    for para in txt:
        par = []
        tkz = Tokenizer()
        for sent in tkz.split_sentences(tkz.tokenize(para)):
            # st[1] is filled in place via the `annot` alias below.
            st = (''.join(t.value for t in sent), [])
            par.append(st)
            annot = st[1]
            # Tracks whether a word token has already been seen in this
            # sentence (used for the proper-name heuristic below).
            prevtoken = False
            for token in sent:
                if token.type in ['Comment', 'Tag']:
                    # Markup passes through unchanged.
                    annot.append(formats.GlossToken((token.type, token.value)))
                elif token.type in ['Punct', 'SentPunct', 'Nonword']:
                    annot.append(formats.GlossToken(('c', token.value)))
                elif token.type in ['Cardinal']:
                    # Numbers get a synthetic 'num' gloss instead of a lookup.
                    gloss = Gloss(token.value, ('num',), 'CARDINAL', ())
                    annot.append(formats.GlossToken(('w', (token.value, 'tokenizer', [gloss]))))
                elif token.type in ['Word']:
                    if self.converters:
                        # Pipe the word through each orthography converter in
                        # order; every step may yield several variant forms.
                        wlist = [token.value]
                        for plugin in self.converters:
                            converted = []
                            for w in wlist:
                                for result in OrthographyConverter.get_plugins()[plugin].convert(w):
                                    converted.append(result)
                            wlist = converted
                        # lemmatize() appears to return (stage, glosslist);
                        # keep only successful parses (stage >= 0), falling
                        # back to all candidates when none succeeded.
                        converts = [self.parser.lemmatize(w.lower()) for w in wlist]
                        successfull = [x[1] for x in filter(lambda s:s[0]>=0, converts)] or [c[1] for c in converts]
                        stage = max([c[0] for c in converts])
                        glosslist = []
                        for gl in successfull:
                            glosslist.extend(gl)
                    else:
                        stage, glosslist = self.parser.lemmatize(token.value.lower())
                    # suggest proper name variant for capitalized words (not in sentence-initial position)
                    if token.value.istitle() and prevtoken and 'n.prop' not in set([]).union(*[g.ps for g in glosslist]):
                        propn = Gloss(token.value, ('n.prop',), token.value, ())
                        glosslist.insert(0, propn)
                    # NOTE: unicode() marks this as Python 2 code.
                    annot.append(formats.GlossToken(('w', (token.value, unicode(stage), glosslist))))
                    prevtoken = True
        self.parsed.append(par)
    return self.parsed
def main():
    """Command line entry point: parse arguments, load resources, run parser."""
    load_plugins()
    aparser = argparse.ArgumentParser(
        description='Daba suite. Command line morphological parser.')
    aparser.add_argument('-i', '--infile',
                         help='Input file (.txt or .html)',
                         default="sys.stdin")
    aparser.add_argument('-o', '--outfile',
                         help='Output file',
                         default="sys.stdout")
    aparser.add_argument(
        '-s', '--script',
        action='append',
        choices=OrthographyConverter.get_plugins().keys(),
        default=None,
        help='Perform orthographic conversion operations (defined in plugins). Conversions will be applied in the order they appear on command line.')
    aparser.add_argument(
        "-d", "--dictionary",
        action="append",
        help="Toolbox dictionary file (may be added multiple times)")
    aparser.add_argument("-g", "--grammar",
                         help="Grammar specification file")
    aparser.add_argument("-n", "--noparse",
                         action='store_true',
                         help="Do not parse, only process resources")
    aparser.add_argument("-l", "--list",
                         help="Read input filenames list from file")
    aparser.add_argument("-t", "--detone",
                         action='store_true',
                         help="Ignore tones in dictionary lookups")
    aparser.add_argument("-v", "--verbose",
                         action='store_true',
                         help="Print info messages on loaded dictionaries")
    args = aparser.parse_args()
    dl = DictLoader(verbose=args.verbose)
    gr = GrammarLoader()
    # Register every dictionary given on the command line.
    if args.dictionary:
        for dicfile in args.dictionary:
            dl.addfile(dicfile)
    if args.grammar:
        gr.load(args.grammar)
    if not args.noparse:
        pp = Processor(dl, gr, converters=args.script, detone=args.detone)
        if args.list:
            # Batch mode: the list file names one input file per line
            # (Python 2: bytes are decoded as UTF-8 here).
            with open(args.list) as filelist:
                for line in filelist:
                    infile = os.path.normpath(line.decode('utf-8').strip())
                    if os.path.exists(infile):
                        outfile = os.path.splitext(infile)[0] + '.pars.html'
                        parse_file(infile, outfile, pp, args)
        else:
            parse_file(args.infile, args.outfile, pp, args)
    exit(0)
def parse(self, txt):
    """Tokenize every paragraph in *txt* and gloss each token.

    Returns a list of paragraphs, each a list of
    (sentence text, list of GlossToken) pairs; also kept on self.parsed.
    """
    self.parsed = []
    for para in txt:
        paragraph = []
        tokenizer = Tokenizer()
        for sent in tokenizer.split_sentences(tokenizer.tokenize(para)):
            # The annotation list inside the tuple is filled in place.
            sentence = (''.join(tok.value for tok in sent), [])
            paragraph.append(sentence)
            annot = sentence[1]
            # True once a word has been seen in this sentence; gates the
            # proper-name heuristic below.
            prevtoken = False
            for token in sent:
                if token.type in ('Comment', 'Tag'):
                    annot.append(
                        formats.GlossToken((token.type, token.value)))
                elif token.type in ('Punct', 'SentPunct', 'Nonword'):
                    annot.append(formats.GlossToken(('c', token.value)))
                elif token.type == 'Cardinal':
                    # Numbers get a synthetic 'num' gloss, no lookup needed.
                    numgloss = Gloss(token.value, ('num', ), 'CARDINAL', ())
                    annot.append(
                        formats.GlossToken(
                            ('w', (token.value, 'tokenizer', [numgloss]))))
                elif token.type == 'Word':
                    if self.converters:
                        # Feed the word through each converter in order; a
                        # converter may return several variant spellings.
                        forms = [token.value]
                        for plugin in self.converters:
                            plug = OrthographyConverter.get_plugins()[plugin]
                            forms = [variant
                                     for form in forms
                                     for variant in plug.convert(form)]
                        lemmas = [self.parser.lemmatize(form.lower())
                                  for form in forms]
                        # Prefer variants that parsed (stage >= 0); if none
                        # did, keep all candidate gloss lists.
                        chosen = ([c[1] for c in lemmas if c[0] >= 0]
                                  or [c[1] for c in lemmas])
                        stage = max([c[0] for c in lemmas])
                        glosslist = []
                        for glosses in chosen:
                            glosslist.extend(glosses)
                    else:
                        stage, glosslist = self.parser.lemmatize(
                            token.value.lower())
                    # suggest proper name variant for capitalized words (not in sentence-initial position)
                    if (token.value.istitle() and prevtoken
                            and 'n.prop' not in set(
                                []).union(*[g.ps for g in glosslist])):
                        glosslist.insert(
                            0,
                            Gloss(token.value, ('n.prop', ), token.value, ()))
                    annot.append(
                        formats.GlossToken(
                            ('w', (token.value, unicode(stage), glosslist))))
                    prevtoken = True
        self.parsed.append(paragraph)
    return self.parsed
#!/usr/bin/python # -*- coding: utf-8 -*- import os import sys from plugins import OrthographyConverter def load_plugins(): plugindir = os.path.join(os.path.abspath(os.path.dirname(sys.argv[0])), "..", "plugins") plugins = [x[:-3] for x in os.listdir(plugindir) if x.endswith(".py") and not x.startswith("__")] for plugin in plugins: mod = __import__(".".join(["plugins", plugin])) load_plugins() converter = OrthographyConverter.get_plugins()["nko"] for line in open(sys.argv[1]): ws = line.decode("utf-8").strip().split() print " ".join([converter.convert(w)[0] for w in ws]).encode("utf-8")
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Convert whitespace-separated words from a file with the 'nko' plugin.

Usage: script.py INFILE — reads UTF-8 text, prints the converted words
to stdout.
"""
import os
import sys
from plugins import OrthographyConverter


def load_plugins():
    """Import every plugin module in ../plugins next to this script.

    The modules are imported purely for their side effects (presumably
    registering converters with OrthographyConverter — confirm in the
    plugins package); the module objects themselves are not needed.
    """
    plugindir = os.path.join(os.path.abspath(os.path.dirname(sys.argv[0])),
                             '..', 'plugins')
    plugins = [x[:-3] for x in os.listdir(plugindir)
               if x.endswith('.py') and not x.startswith('__')]
    for plugin in plugins:
        # Drop the unused binding; the import itself is the point.
        __import__('.'.join(['plugins', plugin]))


load_plugins()
converter = OrthographyConverter.get_plugins()["nko"]
# Open with an explicit UTF-8 encoding (the original relied on the
# locale default) and close the handle deterministically via `with`.
with open(sys.argv[1], encoding='utf-8') as infile:
    for line in infile:
        ws = line.strip().split()
        print(' '.join([converter.convert(w)[0] for w in ws]))