Example #1
 def lookup_gloss(self, gloss, gdict):
     'Gloss, Dictionary -> tuple(Gloss)'
     lookup_form = None
     parts = None
     try:
         if self.detone:
             bare = detone(gloss.form)
             if bare in gdict:
                 lookup_form = bare
         else:
             if gloss.form in gdict:
                 lookup_form = gloss.form
             elif '-' in gloss.form:
                 parts = gloss.form.split('-')
                 lookup_form = ''.join(parts)
             else:
                 bare = detone(gloss.form)
                 if not gloss.form == bare and bare in gdict:
                     lookup_form = bare
         if lookup_form:
             pattern = emptyGloss._replace(ps=gloss.ps, gloss=gloss.gloss)
             if parts:
                 out = []
                 for dgloss in gdict[lookup_form]:
                     if dgloss.matches(pattern) and len(
                             dgloss.morphemes) == len(
                                 parts) and tones_match(
                                     gloss.form, dgloss.form):
                         out.append(dgloss)
                 return tuple(out)
             else:
                 return tuple([
                     dgloss for dgloss in gdict[lookup_form]
                     if dgloss.matches(pattern)
                     and tones_match(gloss.form, dgloss.form)
                 ])
         else:
             return ()
     except (KeyError, AttributeError):
         if gloss.form in gdict:
             print('PP', gloss.form, gdict[gloss.form])
         return ()
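
Note: the method above tries the exact (tonal) form first, then a hyphen-joined form, and finally the detoned form before giving up. A minimal sketch of the same detoned-fallback idea against a plain dict; the detone stand-in below is only an NFD approximation, not daba's language-aware implementation.

import unicodedata

def detone(s):
    # Stand-in: strip combining tone marks via NFD decomposition.
    # daba's real detone is language-aware; this only approximates it.
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if not unicodedata.combining(c))

def lookup_with_fallback(form, gdict):
    # Exact (tonal) form first, then the detoned form, as in lookup_gloss.
    if form in gdict:
        return gdict[form]
    bare = detone(form)
    if bare != form and bare in gdict:
        return gdict[bare]
    return ()

gdict = {u'ba': (u'ba:n:river',)}
print(lookup_with_fallback(u'bá', gdict))  # falls back to the toneless key
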
Example #2
 def get_case(self, string):
     string = detone(string)
     if string[0] in u"-‑‐–=\"'\u2018\u201c":  # strip one leading dash/quote/equals character
         string = string[1:]
     if string.isupper():
         case = unicode.upper
     elif string.istitle():
         case = unicode.title
     else:
         case = unicode.lower
     return case
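
Note: get_case returns an unbound string method, so the detected case can be re-applied later to a detoned or otherwise normalized token. A Python 3 sketch of that round trip (str.* in place of unicode.*; the leading-character strip and the detone call are omitted for brevity):

def get_case(string):
    # Classify the token's case and return the matching str method.
    if string.isupper():
        return str.upper
    elif string.istitle():
        return str.title
    return str.lower

case = get_case(u'Bamako')   # remember the source token's case
print(case(u'bamakɔ'))       # -> 'Bamakɔ': case restored after normalization
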
Example #3
def make_taglist(glosses, formforlemma=False, tonal=False):
    result = []
    for g in glosses:
        if formforlemma:
            lemma = dedot(g.form)
        else:
            lemma = make_lemma(g)
        if not tonal:
            lemma = detone(lemma)
        for tag in g.ps:
            result.append((tag, lemma))
    return result
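
Note: make_taglist emits one (tag, lemma) pair per part-of-speech tag on each gloss. A toy run, with a namedtuple standing in for daba's Gloss and one-line stand-ins for the helper functions (all assumptions, just to show the shape of the output; make_taglist itself is the function above):

from collections import namedtuple

Gloss = namedtuple('Gloss', 'form ps gloss morphemes')
# Toy stand-ins for the daba helpers used by make_taglist:
dedot = lambda s: s.replace('.', '')
detone = lambda s: s
make_lemma = lambda g: dedot(g.form)

g = Gloss(form=u'jamana', ps=('n',), gloss=u'country', morphemes=())
print(make_taglist([g]))  # [('n', 'jamana')]
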
Example #4
def make_lemmafunc(args):
    if args.tonal:
        get_lemma = lambda x: dedot(x)
    elif args.nullify:
        nullify_dict={u'ɔ': 'o', u'ɛ': 'e', u'ɲ': 'ny'}
        def get_lemma(x):
            x = detone(''.join(c for c in x if c not in '.'))
            for source, target in nullify_dict.items():
                x = x.replace(source, target)
            return x
    else:
        get_lemma = lambda x: detone(dedot(x))
    return get_lemma
Example #5
def make_lemmafunc(args):
    if args.tonal:
        get_lemma = lambda x: dedot(x)
    elif args.nullify:
        nullify_dict = {u'ɔ': 'o', u'ɛ': 'e', u'ɲ': 'ny'}

        def get_lemma(x):
            x = detone(''.join(c for c in x if c not in '.'))
            for source, target in nullify_dict.items():
                x = x.replace(source, target)
            return x
    else:
        get_lemma = lambda x: detone(dedot(x))
    return get_lemma
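
Note: a sketch of how the lemmatizer returned by make_lemmafunc (Example #5 above) behaves per flag, with argparse.Namespace standing in for the parsed CLI arguments and approximate stand-ins for dedot/detone (assumptions, not daba's definitions):

import argparse
import unicodedata

def dedot(s):
    return s.replace('.', '')

def detone(s):
    # Approximation: drop combining tone marks after NFD decomposition.
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if not unicodedata.combining(c))

args = argparse.Namespace(tonal=False, nullify=True)
get_lemma = make_lemmafunc(args)
print(get_lemma(u'ɲɔ̀gɔ́n'))  # -> 'nyogon': detoned, then ɔ/ɛ/ɲ nullified
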
Example #6
File: formats.py Project: israaar/daba
 def write(self):
     with open(self.filename, 'w', encoding=self.encoding) as outfile:
         outfile.write(u'# <doc path={}'.format(self.filename))
         for (name, content) in self.metadata.items():
             outfile.write(u' {}={}'.format(name, content))
         outfile.write(u'>\n')
         for p in self.para:
             for (senttoken, sentannot) in p:
                 for gt in sentannot:
                     if gt.type == 'w':
                         token = detone(gt.gloss.form)
                         if gt.token.isupper():
                             token = token.upper()
                         elif gt.token.istitle():
                             token = token[0].upper() + token[1:]
                     else:
                         token = gt.value
                     outfile.write(u'{}\n'.format(token))
                 outfile.write("\n")
             outfile.write("\n")
Example #7
File: formats.py Project: maslinych/daba
 def write(self):
     with open(self.filename, 'w') as outfile:
         outfile.write(u'# <doc path={}'.format(self.filename).encode('utf-8'))
         for (name, content) in self.metadata.items():
             outfile.write(u' {}={}'.format(name, content).encode('utf-8'))
         outfile.write(u'>\n'.encode('utf-8'))
         for p in self.para:
             for (senttext, sentannot) in p:
                 for gt in sentannot:
                     if gt.type == 'w':
                         token = detone(gt.gloss.form)
                         if gt.token.isupper():
                             token = token.upper()
                         elif gt.token.istitle():
                             token = token[0].upper() + token[1:]
                     else:
                         token = gt.value
                     outfile.write(u'{}\n'.format(token).encode('utf-8'))
                 outfile.write("\n".encode('utf-8'))
             outfile.write("\n".encode('utf-8'))
Example #8
def main():
    aparser = argparse.ArgumentParser(
        description='Lexicon printer for TreeTagger training')
    aparser.add_argument("-r",
                         "--runtimedir",
                         help="Runtime dir with binary saved dictionaries")
    aparser.add_argument("-t",
                         "--tonal",
                         action="store_true",
                         help="Preserve tones on lemmas")
    aparser.add_argument("-w",
                         "--tonalwords",
                         action="store_true",
                         help="Preserve tones on words")
    aparser.add_argument("-j",
                         "--join",
                         action="store_true",
                         help="Join all sources")
    aparser.add_argument("-p",
                         "--plain",
                         action="store_true",
                         help="Output plain lists of tokens")
    aparser.add_argument("-c", "--corpus", default=None, help="Corpus root")
    aparser.add_argument("-l",
                         "--filelist",
                         action="store",
                         help="List of corpus files to parse")
    aparser.add_argument("-n",
                         "--nopunct",
                         action="store_true",
                         help="Skip punctuation")
    args = aparser.parse_args()

    #locale.setlocale(locale.LC_ALL, 'bm_ML')

    if args.join:
        globaldict = defaultdict(list)

    if args.corpus:
        seentokens = set()
        filelist = []
        with open(args.filelist, 'r') as f:
            for line in f:
                filelist.append(line.strip())
        for filename in filelist:
            parsfile = os.path.join(args.corpus, filename)
            try:
                reader = daba.formats.HtmlReader(parsfile)
                lastpunct = None
                for token in reader:
                    if token.type == 'w':
                        if not args.nopunct and lastpunct:
                            punct = [(lastpunct.type, lastpunct.value)]
                            if args.join:
                                globaldict[lastpunct.value.strip()].extend(
                                    punct)
                            else:
                                print_line(lastpunct.value.strip(), punct)
                            lastpunct = None
                        form = dedot(token.glosslist[0].form).lower()
                        if not args.tonalwords:
                            form = detone(form)
                        else:
                            # FIXME: unsupported tonal for corpus
                            pass
                        result = make_taglist(token.glosslist,
                                              tonal=args.tonal)
                        if args.plain:
                            print_line(form, result)
                        else:
                            if form not in seentokens:
                                seentokens.add(form)
                                print_line(form, result)
                            if args.join:
                                if result not in globaldict[form]:
                                    globaldict[form].extend(result)
                    elif token.type == 'c' and not args.nopunct:
                        lastpunct = token
                    elif token.type == '</s>' and not args.nopunct:
                        if lastpunct:
                            punct = [('SENT', lastpunct.value)]
                            if args.join:
                                globaldict[lastpunct.value.strip()].extend(
                                    punct)
                            else:
                                print_line(lastpunct.value.strip(), punct)
                            lastpunct = None
            except (cElementTree.ParseError) as e:
                sys.stderr.write(u'File format error: {}\n'.format(filename))
                sys.stderr.write(u'ERR {}\n'.format(e))

    if args.runtimedir:
        seenkeys = set()
        dictionary = daba.mparser.DictLoader(
            runtimedir=args.runtimedir).dictionary
        for form in dictionary:
            if ' ' not in form:
                if not args.tonal:
                    form = detone(form)
                if args.plain:
                    for gloss in dictionary[form]:
                        print(gloss)
                        result = make_taglist([gloss],
                                              formforlemma=True,
                                              tonal=args.tonal)
                        for lemma in result:
                            print_line(form, [lemma])
                else:
                    if args.corpus and form in seentokens:
                        continue
                    if form not in seenkeys:
                        glosses = dictionary[form]
                        result = make_taglist(glosses,
                                              formforlemma=True,
                                              tonal=args.tonal)
                        seenkeys.add(form)
                        if args.join:
                            globaldict[form].extend(result)
                        else:
                            print_line(form, result)

    if args.join:
        for form, result in globaldict.items():
            print_line(form, result)
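
Note: print_line is not shown on this page. Given the TreeTagger-lexicon context (one entry per line: a form followed by its tag/lemma readings), it plausibly looks like the hypothetical sketch below; the name matches the calls above, but the exact layout is an assumption.

def print_line(form, taglist):
    # Hypothetical: the form, then tab-separated "TAG lemma" pairs,
    # roughly the TreeTagger open-class lexicon layout.
    readings = u'\t'.join(u'{} {}'.format(tag, lemma) for tag, lemma in taglist)
    print(u'{}\t{}'.format(form, readings))

print_line(u'jamana', [('n', u'jamana')])  # jamana<TAB>n jamana
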
Example #9
File: formats.py Project: israaar/daba
 def push_items(primarykey, lemmalist):
     for key, lx in lemmalist:
         self._dict[key] = lx
         detonedkey = detone(key)
         if not detonedkey == key:
             self._dict[detonedkey] = lx
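
Note: push_items indexes every lemma under both its tonal key and its detoned key, so the dictionary answers tonal and toneless queries alike. The same idea reduced to a plain dict (detone stand-in approximated as before):

import unicodedata

def detone(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if not unicodedata.combining(c))

def index_both(d, key, value):
    d[key] = value
    detonedkey = detone(key)
    if detonedkey != key:
        d[detonedkey] = value  # second entry point for toneless lookups

d = {}
index_both(d, u'bá', u'river')
print(sorted(d))  # ['ba', 'bá'] -- both spellings reach the same entry
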
Example #10
def print_token(gt, args, vardict, polidict, get_lemma, sent=False):
    if gt.type == 'Comment':
        return
    if not gt.type == "w":
        print u"{0}\t".format(gt.token).encode('utf-8'),
    if gt.type == 'w':
        normalized = gt.glosslist[0].form
        if ' ' in normalized:
            words = normalized.split(' ')
            for word in words:
                gt.glosslist[0] = gt.glosslist[0]._replace(form=word)
                print_token(gt, args, vardict, polidict, get_lemma, sent=sent)
            return
        if args.convert and not args.keepsource:
            token = get_lemma(normalized)
            if args.tonal and args.conll:
                token = detone(token)
        else:
            token = gt.token
        print u"{0}\t".format(token).encode('utf-8'),

        tonals = []
        fields = []
        lemmas = []
        tags = set()
        glosses = []
        igtforms = []
        igtglosses = []
        deep = []
        polisemy = []
        for g in gt.glosslist:
            tags = tags.union(g.ps)
            if re.match(r'^[A-Z0-9.]+$', g.gloss):
                gls = g.gloss
            else:
                gls = dedot(g.gloss, '_')
            if not gls and g.morphemes:
                gls = '-'.join([m.gloss for m in g.morphemes])
            glosses.append(gls)
            if not args.tonal:
                if g.morphemes:
                    tonals.append(''.join([dedot(m.form) for m in g.morphemes]))
                else:
                    tonals.append(dedot(g.form))
            if g.morphemes:
                #HACK: if we have no gloss on the top, make up lemma from morphemes
                # targeted at inflected forms analyzed by the parser
                if [m for m in g.morphemes if 'mrph' not in m.ps]:
                    lemmas.append(get_lemma(''.join([dedot(m.form) for m in g.morphemes if m.gloss not in args.flective.split(',')])))
                else:
                    lemmas.append(get_lemma(g.form))

                if args.igt:
                    if not g.gloss:
                        igtforms.append('-'.join([dedot(m.form) for m in g.morphemes]))
                        gls = []
                        for m in g.morphemes:
                            if m.gloss.isupper():
                                gls.append(m.gloss)
                            else:
                                gls.append(dedot(m.gloss, '_'))
                        igtglosses.append('-'.join(gls))
                    else:
                        igtforms.append(dedot(g.form))
                for m in g.morphemes:
                    if not args.conll:
                        # add grammatical glosses to tags
                        if m.gloss.isupper():
                            tags.add(m.gloss)
                    if 'mrph' not in m.ps:
                        deep.append(get_lemma(m.form))
                        #deep.append(m.gloss)
            else:
                if args.igt:
                    igtforms.append(dedot(g.form))
                lemmas.append(get_lemma(g.form))

            if args.variants:
                if g in vardict:
                    if args.canonical:
                        try:
                            lemmas = [get_lemma(vardict[g][0])]
                        except IndexError:
                            pass
                    else:
                        for variant in vardict[g]:
                            lemmas.append(get_lemma(variant))
                
            #HACK truncate extra long glosses lists
            if len(glosses)>10:
                glosses = glosses[:10]
                glosses.append('...')

            fields = [lemmas, tags, glosses, deep]

            if args.convert:
                if args.keepsource:
                    fields.append([normalized])
                else:
                    fields.append([gt.token])

            if not args.tonal:
                fields.append(tonals)

            if args.polisemy:
                for ge, gvs in polidict[dedot(g.form)].items():
                    if dedot(ge, '_') in glosses:
                        polisemy.extend(gvs)
                fields.append(polisemy)
            
            if args.igt:
                fields.append(igtforms)
                fields.append(igtglosses)

            if args.debugfields:
                fields.append([make_tagstring(g) for g in gt.glosslist])
                
        print_fields(fields, unique=args.unique)

    else:
        nfields = 5
        if args.polisemy:
            nfields += 1
        if args.igt:
            nfields += 2
        if args.convert:
            nfields += 1
        if not args.tonal:
            nfields += 1
        if args.debugfields:
            nfields += 1

        if sent and gt.type == 'c':
            ctag = args.senttag
        else:
            ctag = gt.type
        print u"\t".join([gt.token, ctag] + [gt.token]*(nfields-2)).encode('utf-8')
Example #11
 def get_lemma(x):
     x = detone(''.join(c for c in x if c not in '.'))
     for source, target in nullify_dict.items():
         x = x.replace(source, target)
     return x
Example #12
 def get_lemma(x):
     x = detone(''.join(c for c in x if c not in '.'))
     for source, target in nullify_dict.items():
         x = x.replace(source, target)
     return x
Example #13
File: formats.py Project: maslinych/daba
 def push_items(primarykey, lemmalist):
     for key, lx in lemmalist:
         self._dict[key] = lx
         detonedkey = detone(key)
         if not detonedkey == key:
             self._dict[detonedkey] = lx
Example #14
File: formats.py Project: maslinych/daba
    def __init__(self, filename, encoding='utf-8', store=True,
                 variants=False, polisemy=False, keepmrph=False,
                 normalize=True, ignorelist=('i',), inverse=False,
                 lemmafields=('lx', 'le', 'va', 'vc', 'a'),
                 glossfields=('dff', 'gf', 'ge'), canonical=False):

        self._dict = DabaDict()
        self._variants = VariantsDict(canonical=canonical)
        self._polisemy = defaultdict(ddlist)
        self.keepmrph = keepmrph
        self.normalize = normalize
        self.line = 0
        self.ignorelist = ignorelist
        self.inverse = inverse
        self.lemmafields = lemmafields
        self.glossfields = glossfields
        ignore = False
        lemmalist = []
        key = None
        ps = ()

        def parsemm(v):
            try:
                f, p, g = v.split(':')
                if p:
                    ps = tuple(p.split('/'))
                else:
                    ps = ()
                return Gloss(f, ps, g, ())
            except (ValueError):
                print "Error line:", str(self.line), unicode(v).encode('utf-8')

        def normalize(value):
            return normalizeText(value.translate({ord(u'.'): None, ord(u'-'):None}).lower())

        def make_item(value):
            if self.normalize:
                value = normalize(value)
            return [value, Gloss(form=value, ps=(), gloss="", morphemes=())]

        def push_items(primarykey, lemmalist):
            for key, lx in lemmalist:
                self._dict[key] = lx
                detonedkey = detone(key)
                if not detonedkey == key:
                    self._dict[detonedkey] = lx

        def select_gloss(glossdict):
            ge = ''
            for f in self.glossfields:
                try:
                    ge = glossdict[f]
                    break
                except KeyError:
                    pass
            return ge
                    
        def process_record(lemmalist):
            if lemmalist:
                ge = select_gloss(glossdict)
                if self.inverse:
                    key = u'_'.join(['/'.join(ps), ge])
                    lemmalist = [(key, g._replace(ps=ps, gloss=ge)) for k, g in lemmalist]
                    push_items(key, lemmalist)
                else:
                    lemmalist = [(key, item._replace(ps=ps, gloss=ge)) for key, item in lemmalist]
                    if not ps == ('mrph',) or self.keepmrph:
                        if store:
                            push_items(key, lemmalist)
                        if variants and len(lemmalist) > 1:
                            self._variants.add(zip(*lemmalist)[1])

        with codecs.open(filename, 'r', encoding=encoding) as dictfile:
            for line in dictfile:
                self.line = self.line + 1
                # end of the article/dictionary
                if not line or line.isspace():
                    if not ignore:
                        process_record(lemmalist)
                    ignore = False
                    lemmalist = []
                    ps = ()
                    glossdict = {}
                    key = None
                elif line.startswith('\\'):
                    line = unicodedata.normalize('NFKD', line)
                    tag, space, value = line[1:].partition(' ')
                    value = value.strip()
                    if tag in ['lang', 'ver', 'name']:
                        self._dict.__setattr__(tag, value)
                    elif tag in self.ignorelist:
                        ignore = True
                    elif tag in self.lemmafields:
                        if self.normalize:
                            key = normalize(value)
                        else:
                            key = value
                        lemmalist.append(make_item(value))
                    elif tag in ['mm']:
                        lemmalist[-1][1] = lemmalist[-1][1]._replace(morphemes=lemmalist[-1][1].morphemes+(parsemm(value),))
                    elif tag in ['ps'] and not ps:
                        if value:
                            ps = tuple(value.split('/'))
                        else:
                            ps = ()
                    elif tag in self.glossfields:
                        glossdict[tag] = value
                    elif tag in ['gv']:
                        if polisemy:
                            self._polisemy[key][select_gloss(glossdict)].append(value)
                            dk = detone(key)
                            if not dk == key:
                                self._polisemy[dk][select_gloss(glossdict)].append(value)
            else:
                process_record(lemmalist)

            if not self._dict.attributed():
                print r"Dictionary does not contain obligatory \lang, \name or \ver fields.\
                        Please specify them and try to load again."
                print self._dict.lang, self._dict.name, self._dict.ver
Example #15
File: formats.py Project: israaar/daba
    def __init__(self, filename, encoding='utf-8', store=True,
                 variants=False, polisemy=False, keepmrph=False,
                 normalize=True, ignorelist=('i',), inverse=False,
                 lemmafields=('lx', 'le', 'va', 'vc', 'a'),
                 glossfields=('gf', 'ge', 'dff'), canonical=False):

        self._dict = DabaDict()
        self._variants = VariantsDict(canonical=canonical)
        self._polisemy = defaultdict(ddlist)
        self.keepmrph = keepmrph
        self.normalize = normalize
        self.line = 0
        self.ignorelist = ignorelist
        self.inverse = inverse
        self.lemmafields = lemmafields
        self.glossfields = glossfields
        ignore = False
        lemmalist = []
        key = None
        ps = ()

        def parsemm(v):
            try:
                f, p, g = v.split(':')
                if p:
                    ps = tuple(p.split('/'))
                else:
                    ps = ()
                return Gloss(f, ps, g, ())
            except (ValueError):
                print("Error line:", str(self.line), str(v))

        def normalize(value):
            return normalizeText(value.translate({ord(u'.'): None, ord(u'-'):None}).lower())

        def make_item(value):
            if self.normalize:
                value = normalize(value)
            return [value, Gloss(form=value, ps=(), gloss="", morphemes=())]

        def push_items(primarykey, lemmalist):
            for key, lx in lemmalist:
                self._dict[key] = lx
                detonedkey = detone(key)
                if not detonedkey == key:
                    self._dict[detonedkey] = lx

        def select_gloss(glossdict):
            ge = ''
            for f in self.glossfields:
                try:
                    ge = glossdict[f]
                    break
                except KeyError:
                    pass
            return ge
                    
        def process_record(key, lemmalist):
            if lemmalist:
                ge = select_gloss(glossdict)
                if self.inverse:
                    key = u'_'.join(['/'.join(ps), ge])
                    lemmalist = [(key, g._replace(ps=ps, gloss=ge)) for k, g in lemmalist]
                    push_items(key, lemmalist)
                else:
                    lemmalist = [(key, item._replace(ps=ps, gloss=ge)) for key, item in lemmalist]
                    if not ps == ('mrph',) or self.keepmrph:
                        if store:
                            push_items(key, lemmalist)
                        if variants and len(lemmalist) > 1:
                            self._variants.add(list(zip(*lemmalist))[1])

        with codecs.open(filename, 'r', encoding=encoding) as dictfile:
            for line in dictfile:
                self.line = self.line + 1
                # end of the article/dictionary
                if not line or line.isspace():
                    if not ignore:
                        process_record(key, lemmalist)
                    ignore = False
                    lemmalist = []
                    ps = ()
                    glossdict = {}
                    key = None
                elif line.startswith('\\'):
                    line = unicodedata.normalize('NFKD', line)
                    tag, space, value = line[1:].partition(' ')
                    value = value.strip()
                    if tag in ['lang', 'ver', 'name']:
                        self._dict.__setattr__(tag, value)
                    elif tag in self.ignorelist:
                        ignore = True
                    elif tag in self.lemmafields:
                        if self.normalize:
                            key = normalize(value)
                        else:
                            key = value
                        lemmalist.append(make_item(value))
                    elif tag in ['mm']:
                        lemmalist[-1][1] = lemmalist[-1][1]._replace(morphemes=lemmalist[-1][1].morphemes+(parsemm(value),))
                    elif tag in ['ps'] and not ps:
                        if value:
                            ps = tuple(value.split('/'))
                        else:
                            ps = ()
                    elif tag in self.glossfields:
                        glossdict[tag] = value
                    elif tag in ['gv']:
                        if polisemy:
                            self._polisemy[key][select_gloss(glossdict)].append(value)
                            dk = detone(key)
                            if not dk == key:
                                self._polisemy[dk][select_gloss(glossdict)].append(value)
            else:
                process_record(key, lemmalist)

            if not self._dict.attributed():
                print(r"Dictionary does not contain obligatory \lang, \name or \ver fields.\
                        Please specify them and try to load again.")
                print(self._dict.lang, self._dict.name, self._dict.ver)
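
Note: Example #15 is the Python 3 port of the loader in #14. The visible differences: print became a function; self._variants.add(zip(*lemmalist)[1]) became list(zip(*lemmalist))[1] because zip returns an iterator in Python 3; and process_record now receives key as an explicit parameter, since the #14 version relied on Python 2 list comprehensions leaking their loop variable into the enclosing scope, which Python 3's comprehension scoping no longer allows. The zip change in isolation:

lemmalist = [(u'ba', u'gloss-ba'), (u'bá', u'gloss-bá')]
# Python 2: zip(*lemmalist)[1] indexes the list that zip() returns.
# Python 3: zip() returns an iterator, so materialize it first.
print(list(zip(*lemmalist))[1])  # ('gloss-ba', 'gloss-bá')
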
Example #16
File: mparser.py Project: israaar/daba
 def get_case(self, string):
     string = detone(string)
     if string.isupper():
         case = unicode.upper
Example #17
 def get_case(self, string):
     string = detone(string)
     if string[0] in u"-‑‐–=\"'\u2018\u201c":  # strip one leading dash/quote/equals character
         string = string[1:]
     if string.isupper():
         case = str.upper
Example #18
def print_token(gt, args, vardict, polidict, get_lemma, sent=False):
    if gt.type in ['Comment', '<s>', '<p>']:
        return
    if not gt.type == "w":
        print(gt.token, end="\t")
    if gt.type == 'w':
        normalized = gt.glosslist[0].form
        if ' ' in normalized:
            words = normalized.split(' ')
            for word in words:
                gt.glosslist[0] = gt.glosslist[0]._replace(form=word)
                print_token(gt, args, vardict, polidict, get_lemma, sent=sent)
            return
        if args.convert and not args.keepsource:
            token = get_lemma(normalized)
            if args.tonal and args.conll:
                token = detone(token)
        else:
            token = gt.token
        print(token, end="\t")

        tonals = []
        fields = []
        lemmas = []
        tags = set()
        glosses = []
        igtforms = []
        igtglosses = []
        deep = []
        polisemy = []
        for g in gt.glosslist:
            tags = tags.union(g.ps)
            if re.match(r'^[A-Z0-9.]+$', g.gloss):
                gls = g.gloss
            else:
                gls = dedot(g.gloss, '_')
            if not gls and g.morphemes:
                gls = '-'.join([m.gloss for m in g.morphemes])
            if not args.nogloss:
                glosses.append(gls)
            if not args.tonal:
                if g.morphemes:
                    tonals.append(''.join([dedot(m.form)
                                           for m in g.morphemes]))
                else:
                    tonals.append(dedot(g.form))
            if g.morphemes:
                #HACK: if we have no gloss on the top, make up lemma from morphemes
                # targeted at inflected forms analyzed by the parser
                if [m for m in g.morphemes if 'mrph' not in m.ps]:
                    lemmas.append(
                        get_lemma(''.join([
                            dedot(m.form) for m in g.morphemes
                            if m.gloss not in args.flective.split(',')
                        ])))
                else:
                    lemmas.append(get_lemma(g.form))

                if args.igt:
                    if not g.gloss:
                        igtforms.append('-'.join(
                            [dedot(m.form) for m in g.morphemes]))
                        gls = []
                        for m in g.morphemes:
                            if m.gloss.isupper():
                                gls.append(m.gloss)
                            else:
                                gls.append(dedot(m.gloss, '_'))
                        igtglosses.append('-'.join(gls))
                    else:
                        igtforms.append(dedot(g.form))
                for m in g.morphemes:
                    if not args.conll:
                        # add grammatical glosses to tags
                        if m.gloss.isupper():
                            tags.add(m.gloss)
                    else:
                        glosses.append(m.gloss)
                    if 'mrph' not in m.ps:
                        deep.append(get_lemma(m.form))
                        #deep.append(m.gloss)
            else:
                if args.igt:
                    igtforms.append(dedot(g.form))
                lemmas.append(get_lemma(g.form))

            if args.variants:
                if g in vardict:
                    if args.canonical:
                        try:
                            lemmas = [get_lemma(vardict[g][0])]
                        except IndexError:
                            pass
                    else:
                        for variant in vardict[g]:
                            lemmas.append(get_lemma(variant))

            #HACK truncate extra long glosses lists
            if len(glosses) > 10:
                glosses = glosses[:10]
                glosses.append('...')

            fields = [lemmas, tags, glosses, deep]

            if args.convert:
                if args.keepsource:
                    fields.append([normalized])
                else:
                    fields.append([gt.token])

            if not args.tonal:
                fields.append(tonals)

            if args.polisemy:
                for ge, gvs in polidict[dedot(g.form)].items():
                    if dedot(ge, '_') in glosses:
                        polisemy.extend(gvs)
                fields.append(polisemy)

            if args.igt:
                fields.append(igtforms)
                fields.append(igtglosses)

            if args.debugfields:
                fields.append([make_tagstring(g) for g in gt.glosslist])

        print_fields(fields, unique=args.unique)

    else:
        nfields = 5
        if args.polisemy:
            nfields += 1
        if args.igt:
            nfields += 2
        if args.convert:
            nfields += 1
        if not args.tonal:
            nfields += 1
        if args.debugfields:
            nfields += 1

        if sent and gt.type == 'c':
            ctag = args.senttag
        else:
            ctag = gt.type
        print(u"\t".join([gt.token, ctag] + [gt.token] * (nfields - 2)))