def lookup_gloss(self, gloss, gdict):
    'Gloss, Dictionary -> tuple(Gloss)'
    lookup_form = None
    parts = None
    try:
        if self.detone:
            bare = detone(gloss.form)
            if bare in gdict:
                lookup_form = bare
        else:
            if gloss.form in gdict:
                lookup_form = gloss.form
            elif '-' in gloss.form:
                parts = gloss.form.split('-')
                lookup_form = ''.join(parts)
            else:
                bare = detone(gloss.form)
                if not gloss.form == bare and bare in gdict:
                    lookup_form = bare
        if lookup_form:
            pattern = emptyGloss._replace(ps=gloss.ps, gloss=gloss.gloss)
            if parts:
                out = []
                for dgloss in gdict[lookup_form]:
                    if (dgloss.matches(pattern)
                            and len(dgloss.morphemes) == len(parts)
                            and tones_match(gloss.form, dgloss.form)):
                        out.append(dgloss)
                return tuple(out)
            else:
                return tuple([dgloss for dgloss in gdict[lookup_form]
                              if dgloss.matches(pattern)
                              and tones_match(gloss.form, dgloss.form)])
        else:
            return ()
    except (KeyError, AttributeError):
        if gloss.form in gdict:
            print('PP', gloss.form, gdict[gloss.form])
        return ()
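# Hedged usage sketch (not from the source). It assumes Gloss is the namedtuple
# ('form', 'ps', 'gloss', 'morphemes') used elsewhere in this module, emptyGloss
# is the all-empty Gloss, gdict maps wordforms to lists of dictionary Glosses,
# and `parser` is a hypothetical instance of the enclosing class with a boolean
# `detone` attribute. The data are made up.
from collections import namedtuple

Gloss = namedtuple('Gloss', 'form ps gloss morphemes')
emptyGloss = Gloss('', (), '', ())

gdict = {u'jiri': [Gloss(u'jíri', ('n',), 'tree', ())]}   # form -> [Gloss, ...]
query = Gloss(u'jiri', (), '', ())

# With detone enabled the query form is stripped of tones before lookup, so a
# toneless query can still hit the tonal entry (tones_match() permitting):
# parser.lookup_gloss(query, gdict)  # -> (Gloss(u'jíri', ('n',), 'tree', ()),)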
def get_case(self, string):
    string = detone(string)
    # strip a leading dash/quote character before testing the case
    if string[0] in u"-‑‐–=\"'\u2018\u201c":
        string = string[1:]
    if string.isupper():
        case = unicode.upper
    elif string.istitle():
        case = unicode.title
    else:
        case = unicode.lower
    return case
def make_taglist(glosses, formforlemma=False, tonal=False):
    result = []
    for g in glosses:
        if formforlemma:
            lemma = dedot(g.form)
        else:
            lemma = make_lemma(g)
        if not tonal:
            lemma = detone(lemma)
        for tag in g.ps:
            result.append((tag, lemma))
    return result
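# Hedged example (not from the source; assumes dedot() strips morpheme-boundary
# dots and detone() strips tonal diacritics). With formforlemma=True the lemma is
# the dictionary form itself, and one (tag, lemma) pair is emitted per POS tag:
#
#   g = Gloss(form=u'jíri', ps=('n',), gloss='tree', morphemes=())
#   make_taglist([g], formforlemma=True)               # -> [('n', u'jiri')]
#   make_taglist([g], formforlemma=True, tonal=True)   # -> [('n', u'jíri')]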
def make_lemmafunc(args):
    if args.tonal:
        get_lemma = lambda x: dedot(x)
    elif args.nullify:
        nullify_dict = {u'ɔ': 'o', u'ɛ': 'e', u'ɲ': 'ny'}
        def get_lemma(x):
            x = detone(''.join(c for c in x if c not in '.'))
            for source, target in nullify_dict.items():
                x = x.replace(source, target)
            return x
    else:
        get_lemma = lambda x: detone(dedot(x))
    return get_lemma
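# Hedged illustration of the nullify branch (not from the source; assumes detone()
# strips tonal diacritics): dots are removed, tones dropped, and ɔ/ɛ/ɲ rewritten
# to their ASCII-friendly counterparts o/e/ny:
#
#   get_lemma = make_lemmafunc(args)   # with args.nullify set, args.tonal unset
#   get_lemma(u'ɲɔ̀.nin')               # -> u'nyonin'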
def write(self):
    with open(self.filename, 'w', encoding=self.encoding) as outfile:
        outfile.write(u'# <doc path={}'.format(self.filename))
        for (name, content) in self.metadata.items():
            outfile.write(u' {}={}'.format(name, content))
        outfile.write(u'>\n')
        for p in self.para:
            for (senttoken, sentannot) in p:
                for gt in sentannot:
                    if gt.type == 'w':
                        token = detone(gt.gloss.form)
                        if gt.token.isupper():
                            token = token.upper()
                        elif gt.token.istitle():
                            token = token[0].upper() + token[1:]
                    else:
                        token = gt.value
                    outfile.write(u'{}\n'.format(token))
                outfile.write("\n")
            outfile.write("\n")
def write(self):
    with open(self.filename, 'w') as outfile:
        outfile.write(u'# <doc path={}'.format(self.filename).encode('utf-8'))
        for (name, content) in self.metadata.items():
            outfile.write(u' {}={}'.format(name, content).encode('utf-8'))
        outfile.write(u'>\n'.encode('utf-8'))
        for p in self.para:
            for (senttext, sentannot) in p:
                for gt in sentannot:
                    if gt.type == 'w':
                        token = detone(gt.gloss.form)
                        if gt.token.isupper():
                            token = token.upper()
                        elif gt.token.istitle():
                            token = token[0].upper() + token[1:]
                    else:
                        token = gt.value
                    outfile.write(u'{}\n'.format(token).encode('utf-8'))
                outfile.write("\n".encode('utf-8'))
            outfile.write("\n".encode('utf-8'))
def main():
    aparser = argparse.ArgumentParser(
        description='Lexicon printer for TreeTagger training')
    aparser.add_argument("-r", "--runtimedir",
                         help="Runtime dir with binary saved dictionaries")
    aparser.add_argument("-t", "--tonal", action="store_true",
                         help="Preserve tones on lemmas")
    aparser.add_argument("-w", "--tonalwords", action="store_true",
                         help="Preserve tones on words")
    aparser.add_argument("-j", "--join", action="store_true",
                         help="Join all sources")
    aparser.add_argument("-p", "--plain", action="store_true",
                         help="Output plain lists of tokens")
    aparser.add_argument("-c", "--corpus", default=None, help="Corpus root")
    aparser.add_argument("-l", "--filelist", action="store",
                         help="List of corpus files to parse")
    aparser.add_argument("-n", "--nopunct", action="store_true",
                         help="Skip punctuation")
    args = aparser.parse_args()
    #locale.setlocale(locale.LC_ALL, 'bm_ML')
    if args.join:
        globaldict = defaultdict(list)
    if args.corpus:
        seentokens = set()
        filelist = []
        with open(args.filelist, 'r') as f:
            for line in f:
                filelist.append(line.strip())
        for filename in filelist:
            parsfile = os.path.join(args.corpus, filename)
            try:
                reader = daba.formats.HtmlReader(parsfile)
                lastpunct = None
                for token in reader:
                    if token.type == 'w':
                        if not args.nopunct and lastpunct:
                            punct = [(lastpunct.type, lastpunct.value)]
                            if args.join:
                                globaldict[lastpunct.value.strip()].extend(punct)
                            else:
                                print_line(lastpunct.value.strip(), punct)
                            lastpunct = None
                        form = dedot(token.glosslist[0].form).lower()
                        if not args.tonalwords:
                            form = detone(form)
                        else:
                            # FIXME: unsupported tonal for corpus
                            pass
                        result = make_taglist(token.glosslist, tonal=args.tonal)
                        if args.plain:
                            print_line(form, result)
                        else:
                            if form not in seentokens:
                                seentokens.add(form)
                                print_line(form, result)
                        if args.join:
                            if result not in globaldict[form]:
                                globaldict[form].extend(result)
                    elif token.type == 'c' and not args.nopunct:
                        lastpunct = token
                    elif token.type == '</s>' and not args.nopunct:
                        if lastpunct:
                            punct = [('SENT', lastpunct.value)]
                            if args.join:
                                globaldict[lastpunct.value.strip()].extend(punct)
                            else:
                                print_line(lastpunct.value.strip(), punct)
                            lastpunct = None
            except (cElementTree.ParseError) as e:
                sys.stderr.write(u'File format error: {}\n'.format(filename))
                sys.stderr.write(u'ERR {}\n'.format(e))
    if args.runtimedir:
        seenkeys = set()
        dictionary = daba.mparser.DictLoader(runtimedir=args.runtimedir).dictionary
        for form in dictionary:
            if ' ' not in form:
                if not args.tonal:
                    form = detone(form)
                if args.plain:
                    for gloss in dictionary[form]:
                        print(gloss)
                        result = make_taglist([gloss], formforlemma=True, tonal=args.tonal)
                        for lemma in result:
                            print_line(form, [lemma])
                else:
                    if args.corpus and form in seentokens:
                        continue
                    if form not in seenkeys:
                        glosses = dictionary[form]
                        result = make_taglist(glosses, formforlemma=True, tonal=args.tonal)
                        seenkeys.add(form)
                        if args.join:
                            globaldict[form].extend(result)
                        else:
                            print_line(form, result)
    if args.join:
        for form, result in globaldict.iteritems():
            print_line(form, result)
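# Hedged invocation sketch (the script name is hypothetical; the options are the
# ones declared in main() above): build a joint lexicon from parsed corpus files
# plus the runtime dictionaries, skipping punctuation.
#
#   python print_lexicon.py -c /data/corpus -l files.txt -r run/ -j -n > lexicon.txt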
def push_items(primarykey, lemmalist):
    for key, lx in lemmalist:
        self._dict[key] = lx
        detonedkey = detone(key)
        if not detonedkey == key:
            self._dict[detonedkey] = lx
def print_token(gt, args, vardict, polidict, get_lemma, sent=False):
    if gt.type == 'Comment':
        return
    if not gt.type == "w":
        print u"{0}\t".format(gt.token).encode('utf-8'),
    if gt.type == 'w':
        normalized = gt.glosslist[0].form
        if ' ' in normalized:
            words = normalized.split(' ')
            for word in words:
                gt.glosslist[0] = gt.glosslist[0]._replace(form=word)
                print_token(gt, args, vardict, polidict, get_lemma, sent=sent)
            return
        if args.convert and not args.keepsource:
            token = get_lemma(normalized)
            if args.tonal and args.conll:
                token = detone(token)
        else:
            token = gt.token
        print u"{0}\t".format(token).encode('utf-8'),
        tonals = []
        fields = []
        lemmas = []
        tags = set()
        glosses = []
        igtforms = []
        igtglosses = []
        deep = []
        polisemy = []
        for g in gt.glosslist:
            tags = tags.union(g.ps)
            if re.match(r'^[A-Z0-9.]+$', g.gloss):
                gls = g.gloss
            else:
                gls = dedot(g.gloss, '_')
            if not gls and g.morphemes:
                gls = '-'.join([m.gloss for m in g.morphemes])
            glosses.append(gls)
            if not args.tonal:
                if g.morphemes:
                    tonals.append(''.join([dedot(m.form) for m in g.morphemes]))
                else:
                    tonals.append(dedot(g.form))
            if g.morphemes:
                #HACK: if we have no gloss on the top, make up lemma from morphemes
                # targeted at inflected forms analyzed by the parser
                if [m for m in g.morphemes if 'mrph' not in m.ps]:
                    lemmas.append(get_lemma(''.join(
                        [dedot(m.form) for m in g.morphemes
                         if m.gloss not in args.flective.split(',')])))
                else:
                    lemmas.append(get_lemma(g.form))
                if args.igt:
                    if not g.gloss:
                        igtforms.append('-'.join([dedot(m.form) for m in g.morphemes]))
                        gls = []
                        for m in g.morphemes:
                            if m.gloss.isupper():
                                gls.append(m.gloss)
                            else:
                                gls.append(dedot(m.gloss, '_'))
                        igtglosses.append('-'.join(gls))
                    else:
                        igtforms.append(dedot(g.form))
                for m in g.morphemes:
                    if not args.conll:
                        # add grammatical glosses to tags
                        if m.gloss.isupper():
                            tags.add(m.gloss)
                    if 'mrph' not in m.ps:
                        deep.append(get_lemma(m.form))
                        #deep.append(m.gloss)
            else:
                if args.igt:
                    igtforms.append(dedot(g.form))
                lemmas.append(get_lemma(g.form))
            if args.variants:
                if g in vardict:
                    if args.canonical:
                        try:
                            lemmas = [get_lemma(vardict[g][0])]
                        except IndexError:
                            pass
                    else:
                        for variant in vardict[g]:
                            lemmas.append(get_lemma(variant))
        #HACK truncate extra long glosses lists
        if len(glosses) > 10:
            glosses = glosses[:10]
            glosses.append('...')
        fields = [lemmas, tags, glosses, deep]
        if args.convert:
            if args.keepsource:
                fields.append([normalized])
            else:
                fields.append([gt.token])
        if not args.tonal:
            fields.append(tonals)
        if args.polisemy:
            for ge, gvs in polidict[dedot(g.form)].items():
                if dedot(ge, '_') in glosses:
                    polisemy.extend(gvs)
            fields.append(polisemy)
        if args.igt:
            fields.append(igtforms)
            fields.append(igtglosses)
        if args.debugfields:
            fields.append([make_tagstring(g) for g in gt.glosslist])
        print_fields(fields, unique=args.unique)
    else:
        nfields = 5
        if args.polisemy:
            nfields += 1
        if args.igt:
            nfields += 2
        if args.convert:
            nfields += 1
        if not args.tonal:
            nfields += 1
        if args.debugfields:
            nfields += 1
        if sent and gt.type == 'c':
            ctag = args.senttag
        else:
            ctag = gt.type
        print u"\t".join([gt.token, ctag] + [gt.token]*(nfields-2)).encode('utf-8')
def get_lemma(x):
    x = detone(''.join(c for c in x if c not in '.'))
    for source, target in nullify_dict.items():
        x = x.replace(source, target)
    return x
def __init__(self, filename, encoding='utf-8',
             store=True, variants=False, polisemy=False, keepmrph=False,
             normalize=True, ignorelist=('i',), inverse=False,
             lemmafields=('lx', 'le', 'va', 'vc', 'a'),
             glossfields=('dff', 'gf', 'ge'), canonical=False):
    self._dict = DabaDict()
    self._variants = VariantsDict(canonical=canonical)
    self._polisemy = defaultdict(ddlist)
    self.keepmrph = keepmrph
    self.normalize = normalize
    self.line = 0
    self.ignorelist = ignorelist
    self.inverse = inverse
    self.lemmafields = lemmafields
    self.glossfields = glossfields
    ignore = False
    lemmalist = []
    key = None
    ps = ()

    def parsemm(v):
        try:
            f, p, g = v.split(':')
            if p:
                ps = tuple(p.split('/'))
            else:
                ps = ()
            return Gloss(f, ps, g, ())
        except (ValueError):
            print "Error line:", str(self.line), unicode(v).encode('utf-8')

    def normalize(value):
        return normalizeText(value.translate({ord(u'.'): None, ord(u'-'): None}).lower())

    def make_item(value):
        if self.normalize:
            value = normalize(value)
        return [value, Gloss(form=value, ps=(), gloss="", morphemes=())]

    def push_items(primarykey, lemmalist):
        for key, lx in lemmalist:
            self._dict[key] = lx
            detonedkey = detone(key)
            if not detonedkey == key:
                self._dict[detonedkey] = lx

    def select_gloss(glossdict):
        ge = ''
        for f in self.glossfields:
            try:
                ge = glossdict[f]
                break
            except KeyError:
                pass
        return ge

    def process_record(lemmalist):
        if lemmalist:
            ge = select_gloss(glossdict)
            if self.inverse:
                key = u'_'.join(['/'.join(ps), ge])
                lemmalist = [(key, g._replace(ps=ps, gloss=ge)) for k, g in lemmalist]
                push_items(key, lemmalist)
            else:
                lemmalist = [(key, item._replace(ps=ps, gloss=ge)) for key, item in lemmalist]
                if not ps == ('mrph',) or self.keepmrph:
                    if store:
                        push_items(key, lemmalist)
                    if variants and len(lemmalist) > 1:
                        self._variants.add(zip(*lemmalist)[1])

    with codecs.open(filename, 'r', encoding=encoding) as dictfile:
        for line in dictfile:
            self.line = self.line + 1
            # end of the article/dictionary
            if not line or line.isspace():
                if not ignore:
                    process_record(lemmalist)
                ignore = False
                lemmalist = []
                ps = ()
                glossdict = {}
                key = None
            elif line.startswith('\\'):
                line = unicodedata.normalize('NFKD', line)
                tag, space, value = line[1:].partition(' ')
                value = value.strip()
                if tag in ['lang', 'ver', 'name']:
                    self._dict.__setattr__(tag, value)
                elif tag in self.ignorelist:
                    ignore = True
                elif tag in self.lemmafields:
                    if self.normalize:
                        key = normalize(value)
                    else:
                        key = value
                    lemmalist.append(make_item(value))
                elif tag in ['mm']:
                    lemmalist[-1][1] = lemmalist[-1][1]._replace(
                        morphemes=lemmalist[-1][1].morphemes + (parsemm(value),))
                elif tag in ['ps'] and not ps:
                    if value:
                        ps = tuple(value.split('/'))
                    else:
                        ps = ()
                elif tag in self.glossfields:
                    glossdict[tag] = value
                elif tag in ['gv']:
                    if polisemy:
                        self._polisemy[key][select_gloss(glossdict)].append(value)
                        dk = detone(key)
                        if not dk == key:
                            self._polisemy[dk][select_gloss(glossdict)].append(value)
        else:
            process_record(lemmalist)

    if not self._dict.attributed():
        print r"Dictionary does not contain obligatory \lang, \name or \ver fields. Please specify them and try to load again."
        print self._dict.lang, self._dict.name, self._dict.ver
def __init__(self, filename, encoding='utf-8',
             store=True, variants=False, polisemy=False, keepmrph=False,
             normalize=True, ignorelist=('i',), inverse=False,
             lemmafields=('lx', 'le', 'va', 'vc', 'a'),
             glossfields=('gf', 'ge', 'dff'), canonical=False):
    self._dict = DabaDict()
    self._variants = VariantsDict(canonical=canonical)
    self._polisemy = defaultdict(ddlist)
    self.keepmrph = keepmrph
    self.normalize = normalize
    self.line = 0
    self.ignorelist = ignorelist
    self.inverse = inverse
    self.lemmafields = lemmafields
    self.glossfields = glossfields
    ignore = False
    lemmalist = []
    key = None
    ps = ()

    def parsemm(v):
        try:
            f, p, g = v.split(':')
            if p:
                ps = tuple(p.split('/'))
            else:
                ps = ()
            return Gloss(f, ps, g, ())
        except (ValueError):
            print("Error line:", str(self.line), str(v))

    def normalize(value):
        return normalizeText(value.translate({ord(u'.'): None, ord(u'-'): None}).lower())

    def make_item(value):
        if self.normalize:
            value = normalize(value)
        return [value, Gloss(form=value, ps=(), gloss="", morphemes=())]

    def push_items(primarykey, lemmalist):
        for key, lx in lemmalist:
            self._dict[key] = lx
            detonedkey = detone(key)
            if not detonedkey == key:
                self._dict[detonedkey] = lx

    def select_gloss(glossdict):
        ge = ''
        for f in self.glossfields:
            try:
                ge = glossdict[f]
                break
            except KeyError:
                pass
        return ge

    def process_record(key, lemmalist):
        if lemmalist:
            ge = select_gloss(glossdict)
            if self.inverse:
                key = u'_'.join(['/'.join(ps), ge])
                lemmalist = [(key, g._replace(ps=ps, gloss=ge)) for k, g in lemmalist]
                push_items(key, lemmalist)
            else:
                lemmalist = [(key, item._replace(ps=ps, gloss=ge)) for key, item in lemmalist]
                if not ps == ('mrph',) or self.keepmrph:
                    if store:
                        push_items(key, lemmalist)
                    if variants and len(lemmalist) > 1:
                        self._variants.add(list(zip(*lemmalist))[1])

    with codecs.open(filename, 'r', encoding=encoding) as dictfile:
        for line in dictfile:
            self.line = self.line + 1
            # end of the article/dictionary
            if not line or line.isspace():
                if not ignore:
                    process_record(key, lemmalist)
                ignore = False
                lemmalist = []
                ps = ()
                glossdict = {}
                key = None
            elif line.startswith('\\'):
                line = unicodedata.normalize('NFKD', line)
                tag, space, value = line[1:].partition(' ')
                value = value.strip()
                if tag in ['lang', 'ver', 'name']:
                    self._dict.__setattr__(tag, value)
                elif tag in self.ignorelist:
                    ignore = True
                elif tag in self.lemmafields:
                    if self.normalize:
                        key = normalize(value)
                    else:
                        key = value
                    lemmalist.append(make_item(value))
                elif tag in ['mm']:
                    lemmalist[-1][1] = lemmalist[-1][1]._replace(
                        morphemes=lemmalist[-1][1].morphemes + (parsemm(value),))
                elif tag in ['ps'] and not ps:
                    if value:
                        ps = tuple(value.split('/'))
                    else:
                        ps = ()
                elif tag in self.glossfields:
                    glossdict[tag] = value
                elif tag in ['gv']:
                    if polisemy:
                        self._polisemy[key][select_gloss(glossdict)].append(value)
                        dk = detone(key)
                        if not dk == key:
                            self._polisemy[dk][select_gloss(glossdict)].append(value)
        else:
            process_record(key, lemmalist)

    if not self._dict.attributed():
        print(r"Dictionary does not contain obligatory \lang, \name or \ver fields. Please specify them and try to load again.")
        print(self._dict.lang, self._dict.name, self._dict.ver)
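# Hedged sketch of the Toolbox/SFM input this reader expects (field names taken
# from the tags handled above; the record content itself is made up):
#
#   \lang bam
#   \name example dictionary
#   \ver 0.1
#
#   \lx jíri
#   \ps n
#   \ge tree
#
# A blank line closes a record; \lx/\le/\va/\vc/\a give the lemma and its
# variants, \ps the part of speech, \gf/\ge/\dff the gloss (first available
# field wins), \mm bound morphemes as form:ps:gloss, and \gv gloss variants
# collected for polysemy.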
def get_case(self, string):
    string = detone(string)
    # strip a leading dash/quote character before testing the case
    if string[0] in u"-‑‐–=\"'\u2018\u201c":
        string = string[1:]
    if string.isupper():
        case = str.upper
    elif string.istitle():
        case = str.title
    else:
        case = str.lower
    return case
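# Hedged usage sketch (not from the source): the returned unbound str method lets
# the caller re-impose the original token's capitalization on a normalized form:
#
#   case = self.get_case('Fanta')   # istitle() -> str.title
#   case('fanta')                   # -> 'Fanta'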
def print_token(gt, args, vardict, polidict, get_lemma, sent=False):
    if gt.type in ['Comment', '<s>', '<p>']:
        return
    if not gt.type == "w":
        print(gt.token, end="\t")
    if gt.type == 'w':
        normalized = gt.glosslist[0].form
        if ' ' in normalized:
            words = normalized.split(' ')
            for word in words:
                gt.glosslist[0] = gt.glosslist[0]._replace(form=word)
                print_token(gt, args, vardict, polidict, get_lemma, sent=sent)
            return
        if args.convert and not args.keepsource:
            token = get_lemma(normalized)
            if args.tonal and args.conll:
                token = detone(token)
        else:
            token = gt.token
        print(token, end="\t")
        tonals = []
        fields = []
        lemmas = []
        tags = set()
        glosses = []
        igtforms = []
        igtglosses = []
        deep = []
        polisemy = []
        for g in gt.glosslist:
            tags = tags.union(g.ps)
            if re.match(r'^[A-Z0-9.]+$', g.gloss):
                gls = g.gloss
            else:
                gls = dedot(g.gloss, '_')
            if not gls and g.morphemes:
                gls = '-'.join([m.gloss for m in g.morphemes])
            if not args.nogloss:
                glosses.append(gls)
            if not args.tonal:
                if g.morphemes:
                    tonals.append(''.join([dedot(m.form) for m in g.morphemes]))
                else:
                    tonals.append(dedot(g.form))
            if g.morphemes:
                #HACK: if we have no gloss on the top, make up lemma from morphemes
                # targeted at inflected forms analyzed by the parser
                if [m for m in g.morphemes if 'mrph' not in m.ps]:
                    lemmas.append(
                        get_lemma(''.join([
                            dedot(m.form) for m in g.morphemes
                            if m.gloss not in args.flective.split(',')
                        ])))
                else:
                    lemmas.append(get_lemma(g.form))
                if args.igt:
                    if not g.gloss:
                        igtforms.append('-'.join(
                            [dedot(m.form) for m in g.morphemes]))
                        gls = []
                        for m in g.morphemes:
                            if m.gloss.isupper():
                                gls.append(m.gloss)
                            else:
                                gls.append(dedot(m.gloss, '_'))
                        igtglosses.append('-'.join(gls))
                    else:
                        igtforms.append(dedot(g.form))
                for m in g.morphemes:
                    if not args.conll:
                        # add grammatical glosses to tags
                        if m.gloss.isupper():
                            tags.add(m.gloss)
                        else:
                            glosses.append(m.gloss)
                    if 'mrph' not in m.ps:
                        deep.append(get_lemma(m.form))
                        #deep.append(m.gloss)
            else:
                if args.igt:
                    igtforms.append(dedot(g.form))
                lemmas.append(get_lemma(g.form))
            if args.variants:
                if g in vardict:
                    if args.canonical:
                        try:
                            lemmas = [get_lemma(vardict[g][0])]
                        except IndexError:
                            pass
                    else:
                        for variant in vardict[g]:
                            lemmas.append(get_lemma(variant))
        #HACK truncate extra long glosses lists
        if len(glosses) > 10:
            glosses = glosses[:10]
            glosses.append('...')
        fields = [lemmas, tags, glosses, deep]
        if args.convert:
            if args.keepsource:
                fields.append([normalized])
            else:
                fields.append([gt.token])
        if not args.tonal:
            fields.append(tonals)
        if args.polisemy:
            for ge, gvs in polidict[dedot(g.form)].items():
                if dedot(ge, '_') in glosses:
                    polisemy.extend(gvs)
            fields.append(polisemy)
        if args.igt:
            fields.append(igtforms)
            fields.append(igtglosses)
        if args.debugfields:
            fields.append([make_tagstring(g) for g in gt.glosslist])
        print_fields(fields, unique=args.unique)
    else:
        nfields = 5
        if args.polisemy:
            nfields += 1
        if args.igt:
            nfields += 2
        if args.convert:
            nfields += 1
        if not args.tonal:
            nfields += 1
        if args.debugfields:
            nfields += 1
        if sent and gt.type == 'c':
            ctag = args.senttag
        else:
            ctag = gt.type
        print(u"\t".join([gt.token, ctag] + [gt.token] * (nfields - 2)))