Example #1
import sys
import argparse

import formats
# StreamEditor and ScriptParser are defined elsewhere in the source module


def main():
    aparser = argparse.ArgumentParser(
        description='Stream editor for files in Daba format')
    aparser.add_argument('infile', help='Input file (.html)')
    aparser.add_argument('-o', '--outfile', help='Output file', default=None)
    aparser.add_argument('-s',
                         '--script',
                         help='File with edit commands',
                         required=True)
    aparser.add_argument('-v',
                         '--verbose',
                         help='Print info messages',
                         action='store_true')
    args = aparser.parse_args()
    if not args.outfile:
        args.outfile = args.infile
    # start processing
    if args.verbose:
        sys.stderr.write(u'Processing {0} with rules from {1}...\n'.format(
            args.infile, args.script).encode('utf-8'))
    sed = StreamEditor(verbose=args.verbose)
    script = ScriptParser(args.script)
    in_handler = formats.HtmlReader(args.infile, compatibility_mode=False)
    processed_tokens = list(sed.apply_script(script, in_handler))
    if sed.dirty:
        out_handler = formats.HtmlWriter(
            (in_handler.metadata,
             in_handler.make_compatible_glosses(processed_tokens)),
            args.outfile)
        out_handler.write()
        if args.verbose:
            sys.stderr.write(u'Written {0}\n'.format(
                args.outfile).encode('utf-8'))
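A minimal sketch of how this entry point would be driven, assuming the snippet lives in a standalone script (script and file names here are hypothetical; the same guard applies to the other main()-based examples below):

if __name__ == '__main__':
    main()

# Apply edit commands from rules.txt to corpus.html, writing the result
# to corpus.edited.html with progress messages on stderr:
#   python dabased.py corpus.html -s rules.txt -o corpus.edited.html -v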
Example #2
import sys
import argparse
from collections import defaultdict

import formats


def main():
    aparser = argparse.ArgumentParser(
        description='Daba suite. Metadata pretty printer.')
    aparser.add_argument('infile', help='Input file (.html)')
    aparser.add_argument('-f',
                         '--field',
                         dest='fields',
                         action='append',
                         help='Metadata field name')
    aparser.add_argument('-w',
                         '--words',
                         dest='fields',
                         action='append_const',
                         const='_auto:words',
                         help='Number of word tokens in document')
    aparser.add_argument('-s',
                         '--sentences',
                         dest='fields',
                         action='append_const',
                         const='_auto:sentences',
                         help='Number of sentences in document')
    aparser.add_argument('-p',
                         '--paragraphs',
                         dest='fields',
                         action='append_const',
                         const='_auto:paragraphs',
                         help='Number of paragraphs in document')
    aparser.add_argument('-d',
                         '--delimiter',
                         action='store',
                         default="\t",
                         help='Delimiter for output fields')
    aparser.add_argument('-a',
                         '--all',
                         action='store_true',
                         help='Print all metadata found in a file')
    args = aparser.parse_args()

    reader = formats.HtmlReader(args.infile, onlymeta=True)
    meta = defaultdict(unicode)
    for k, v in reader.metadata.items():
        meta[k] = v

    sys.stdout.write(unicode(args.infile).encode('utf-8'))

    if args.all:
        sys.stdout.write('\n')
        for name in sorted(meta.iterkeys()):
            sys.stdout.write(u"\t".join([name, meta[name]]).encode('utf-8'))
            sys.stdout.write('\n')
        sys.stdout.write('\n\n')
    else:
        sys.stdout.write(args.delimiter)
        sys.stdout.write(
            args.delimiter.join(
                # args.fields is None when no field option was given
                [meta[field] for field in args.fields or []]).encode('utf-8'))
        sys.stdout.write('\n')
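A hypothetical invocation sketch; note that -w, -s and -p append pseudo-field names to the same fields list that -f fills:

# Dump every metadata field found in the document:
#   python metaprint.py corpus.html -a
# Print one field plus the word and sentence counters, comma-separated:
#   python metaprint.py corpus.html -f author:name -w -s -d ','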
Example #3
import argparse

import formats


def main():
    aparser = argparse.ArgumentParser(description='Fix multiple ps')
    aparser.add_argument('infile', help='Input file (.html)')
    aparser.add_argument('-o', '--outfile', help='Output file', default=None)
    args = aparser.parse_args()

    handler = formats.HtmlReader(args.infile)
    for gloss, index in handler.itergloss():
        if not gloss.gloss and len(gloss.ps) > 1 and gloss.morphemes:
            stemps = [m.ps for m in gloss.morphemes if 'mrph' not in m.ps]
            if len(stemps) == 1 and stemps[0].issubset(gloss.ps):
                print gloss, '->',
                gloss = gloss._replace(ps=stemps[0])
                print gloss
                handler.setgloss(gloss, index)

    out = formats.HtmlWriter((handler.metadata, handler.glosses),
                             args.outfile or args.infile)
    out.write()
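The heuristic only touches words that lack a gloss string, carry more than one part-of-speech tag, and contain exactly one morpheme whose ps has no 'mrph' tag and is a subset of the word's ps; a hypothetical run:

# Narrow ambiguous ps fields in place (no -o, so the input file is rewritten):
#   python fixps.py corpus.html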
Example #4
File: daba2shxml.py  Project: eldams/daba
                for (toktype, tokvalue) in sentannot:
                    if toktype in ['w']:
                        sourceform, stage, glosslist = tokvalue
                        w = e.SubElement(annot, 'word')
                        w.tail = '\n'
                        # FIXME: format doesn't support ambiguity, always take first gloss
                        gloss = glosslist[0]
                        wtext = e.SubElement(w, 'item', {'type': 'txt', 'lang': 'bam'})
                        wtext.text = gloss.form
                        wtext.tail = '\n'
                        ms = e.SubElement(w, 'morphemes')
                        ms.tail = '\n'
                        if not gloss.ps or (not gloss.gloss and gloss.morphemes):
                            for morph in gloss.morphemes:
                                morph_to_xml(ms, morph)
                        else:
                            morph_to_xml(ms, gloss)


        self.xml = root

    def write(self):
        e.ElementTree(self.xml).write(self.filename, encoding=self.encoding, xml_declaration=True)


infile = sys.argv[1]
outfile = sys.argv[2]

reader = formats.HtmlReader(infile)
SHXmlWriter((reader.metadata, reader.glosses), outfile).write()
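The excerpt omits its module preamble; a minimal sketch of what it would need (the alias e for ElementTree is inferred from the calls above, and morph_to_xml is defined elsewhere in the same file):

import sys
import xml.etree.ElementTree as e

import formats

# Usage: python daba2shxml.py infile.html outfile.xml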
Example #5
import argparse
import glob
import random
import time

import formats
from nltk.tag import CRFTagger  # assumed import; the snippet omits it


def main():
    aparser = argparse.ArgumentParser(description='Daba disambiguator')
    # aparser.add_argument('-i', '--infile', help='Input file (.html)', default="sys.stdin")
    # aparser.add_argument('-o', '--outfile', help='Output file (.html)', default="sys.stdout")
    aparser.add_argument('-l', '--learn', help='Learn model from data (and save as F if provided)', default=None)
    aparser.add_argument('-p', '--pos', help='Prediction for POS', default=False, action='store_true')
    aparser.add_argument('-t', '--tone', help='Prediction for tones', default=False, action='store_true')
    aparser.add_argument('-g', '--gloss', help='Prediction for glosses', default=False, action='store_true')
    aparser.add_argument('-e', '--evalsize', type=int, help='Percent of randomized data to use for evaluation (default 10)', default=10)
    aparser.add_argument('-v', '--verbose', help='Verbose output', default=False, action='store_true')
    args = aparser.parse_args()

    if args.learn:

        if not (args.pos or args.tone or args.gloss):
            print 'Choose pos, tone, gloss or combination of them'
            exit(0)

        print 'Make list of files'
        # concatenate both globs; zipping them would silently drop files
        allfiles = (list(glob.iglob("../corbama/*/*.dis.html")) +
                    list(glob.iglob("../corbama/*.dis.html")))
        allsents = []

        print 'Open files and find features / supervision tags'
        for infile in allfiles:
            print '-', infile
            sent = []
            in_handler = formats.HtmlReader(infile, compatibility_mode=False)
            for token in in_handler:
                if token.type == 'w' or token.type == 'c':
                    tags = ''
                    if args.pos:
                        for ps in token.gloss.ps:
                            tags += ps
                    if args.tone:
                        tags += token.gloss.form.encode('utf-8')
                    if args.gloss:
                        tags += token.gloss.gloss.encode('utf-8')
                    sent.append((token.token, tags))
                if token.type == 'c' and token.token in ['.', '?', '!']:
                    # sentence boundary: flush the accumulated tokens
                    if len(sent) > 1:
                        allsents.append(sent)
                    sent = []

        datalength = len(allsents)
        p = 1 - args.evalsize / 100.0
        print 'Randomize and split the data in train (', int(p * datalength), ' sentences) / test (', int(datalength - p * datalength), ' sentences)'
        random.seed(123456)
        random.shuffle(allsents)
        train_set = allsents[:int(p * datalength)]
        test_set = allsents[int(p * datalength):datalength]

        print 'Building classifier (CRF/NLTK)'
        tagger = CRFTagger(verbose=args.verbose, training_opt={'feature.minfreq': 10})
        t1 = time.time()
        tagger.train(train_set, args.learn)
        t2 = time.time()
        texec = t2 - t1
        # gmtime, not localtime: texec is a duration, not a timestamp
        print "... done in", time.strftime('%H %M %S', time.gmtime(texec))

        print 'Evaluating classifier'
        print tagger.evaluate(test_set)

        if args.verbose:
            print 'Compute detailed output'

    else:
        print 'USE...'
        aparser.print_help()

    exit(0)
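A training-run sketch under the same assumptions (NLTK's CRFTagger additionally needs the python-crfsuite package installed):

# Train a POS model on ../corbama, holding out 15% of sentences for evaluation:
#   python disambiguator.py --learn pos.model -p -e 15 -v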
Example #6
import os
import argparse

import formats
# VariantsLoader, print_token and make_lemmafunc are defined elsewhere
# in the source module


def main():
    oparser = argparse.ArgumentParser(
        description='Native Daba format to vertical format converter')
    oparser.add_argument('infile', help='Input file (.html)')
    oparser.add_argument("-t",
                         "--tonal",
                         action="store_true",
                         help="Make tonal lemmas")
    oparser.add_argument("-u",
                         "--unique",
                         action="store_true",
                         help="Print only unique lemmas and glosses")
    oparser.add_argument("-n",
                         "--nullify",
                         action="store_true",
                         help="Transliterate all non-ascii characters")
    oparser.add_argument(
        "-v",
        "--variants",
        help="Treat all variants in given dictionary as alternative lemmas")
    oparser.add_argument(
        "-p",
        "--polisemy",
        action="store_true",
        help="Show polisemy in a separate field (suggests -v)")
    oparser.add_argument(
        "-c",
        "--convert",
        action="store_true",
        help="Normalize wordform field, move source to the end")
    oparser.add_argument(
        "-k",
        "--keepsource",
        action="store_true",
        help="Keep source token at the head, to use with --convert")
    oparser.add_argument(
        "-i",
        "--igt",
        action="store_true",
        help="Add morpheme-segmented form/gloss pair suited to copy "
             "as IGT examples")
    oparser.add_argument("-d",
                         "--debugfields",
                         action="store_true",
                         help="Add debug fields for Jean-Jacques")
    args = oparser.parse_args()

    reader = formats.HtmlReader(args.infile.decode("utf-8"))

    if args.variants:
        vardict, polidict = VariantsLoader(args.variants).get()
    else:
        vardict = None
        polidict = None

    print "<doc ",
    print u'id="{0}"'.format(os.path.basename(
        args.infile.decode("utf-8"))).encode('utf-8'),

    metad = dict(reader.metadata)
    print u'source_type="{0}"'.format(metad.setdefault(
        'source:type', 'UNDEF')).encode('utf-8'),
    print u'source_year="{0}"'.format(metad.setdefault(
        'source:year', 'UNDEF')).encode('utf-8'),
    print u'text_translation="{0}"'.format(
        metad.setdefault('text:translation', 'UNDEF')).encode('utf-8'),
    print u'text_medium="{0}"'.format(metad.setdefault(
        'text:medium', 'UNDEF')).encode('utf-8'),
    print u'author_name="{0}"'.format(metad.setdefault(
        'author:name', 'UNDEF')).encode('utf-8'),

    try:
        genres = metad['text:genre'].split(';')
        hgenres = [g.split(' : ')[0] for g in genres] + genres
        hgenres.sort()
        metad['text:genre'] = u';'.join(hgenres)
        print u'text_genre="{0}"'.format(metad['text:genre']).encode('utf-8'),
    except KeyError:
        print 'text_genre="UNDEF"',
    try:
        print u'text_title="{0}"'.format(metad['text:title']).encode('utf-8'),
    except KeyError:
        print 'text_title="UNDEF"',
    print ">"

    for par in reader.glosses:
        print "<p>"
        for sent, annot in par:
            print "<s>"
            for token in annot:
                print_token(token, args, vardict, polidict,
                            make_lemmafunc(args))
            print "</s>"
        print "</p>"

    print "</doc>"
Example #7
import os
import fnmatch
import argparse
from collections import defaultdict

import formats
import mparser
# dedot, detone, make_taglist and print_line are defined elsewhere
# in the source module


def main():
    aparser = argparse.ArgumentParser(description='Lexicon printer for TreeTagger training')
    aparser.add_argument("-r", "--runtimedir", help="Runtime dir with binary saved dictionaries")
    aparser.add_argument("-t", "--tonal", action="store_true", help="Preserve tones on word forms")
    aparser.add_argument("-j", "--join", action="store_true", help="Join all sources")
    aparser.add_argument("-p", "--plain", action="store_true", help="Output plain lists of tokens")
    aparser.add_argument("-c", "--corpus", default=None, help="Corpus root")
    aparser.add_argument("-g", "--glob", default="*.pars.html", help="Filename pattern for search in the corpus dir")
    args = aparser.parse_args()

    #locale.setlocale(locale.LC_ALL, 'bm_ML')

    if args.join:
        globaldict = defaultdict(list)

    if args.corpus:
        seentokens = set()
        parsfiles = []
        for root, dirnames, filenames in os.walk(args.corpus):
            for filename in fnmatch.filter(filenames, args.glob):
                parsfile = os.path.join(root, filename)
                reader = formats.HtmlReader(parsfile)
                lastpunct = None
                for token in reader:
                    if token.type == 'w':
                        if lastpunct:
                            print_line(lastpunct.value, [' '.join([lastpunct.type, lastpunct.value])])
                            lastpunct = None
                        form = dedot(token.glosslist[0].form).lower()
                        if not args.tonal:
                            form = detone(form)
                        else:
                            # FIXME: unsupported tonal for corpus
                            pass
                        if args.plain:
                            result = make_taglist(token.glosslist)
                            print_line(form, result)
                        else:
                            if form not in seentokens:
                                result = make_taglist(token.glosslist)
                                seentokens.add(form)
                                if args.join:
                                    globaldict[form].extend(result)
                                else:
                                    print_line(form, result)
                    elif token.type == 'c':
                        lastpunct = token
                    elif token.type == 's':
                        if lastpunct:
                            print_line(lastpunct.value, [' '.join(['SENT', lastpunct.value])])
                            lastpunct = None


    if args.runtimedir:
        seenkeys = set()
        dictionary = mparser.DictLoader(runtimedir=args.runtimedir).dictionary
        for form in dictionary:
            if ' ' not in form:
                if not args.tonal:
                    form = detone(form)
                if args.plain:
                    for gloss in dictionary[form]:
                        print gloss
                        result = make_taglist([gloss], formforlemma=True)
                        for lemma in result:
                            print_line(form, [lemma])
                else:
                    if args.corpus and form in seentokens:
                        continue
                    if form not in seenkeys:
                        glosses = dictionary[form]
                        result = make_taglist(glosses, formforlemma=True)
                        seenkeys.add(form)
                        if args.join:
                            globaldict[form].extend(result)
                        else:
                            print_line(form, result)

    if args.join:
        for form, result in globaldict.iteritems():
            print_line(form, result)
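A usage sketch (script name hypothetical); every lexicon entry goes to stdout via print_line:

# Merge corpus-derived and dictionary-derived entries into one joined lexicon:
#   python printlex.py -c ../corbama -r ./runtime -j > lexicon.txt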
Example #8
File: htmlcat.py  Project: eldams/daba
#!/usr/bin/python
# -*- coding: utf-8 -*-

# SYNOPSIS: <outfile> <infiles...>
import sys

import formats

metadata = []
para = []

# concatenate all input documents; the metadata of the last file read wins
for fname in sys.argv[2:]:
    infile = formats.HtmlReader(fname)
    metadata = infile.metadata
    para.extend(infile.glosses)

outfile = formats.HtmlWriter((metadata, para), sys.argv[1])
outfile.write()
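Following the SYNOPSIS comment, a concatenation run looks like this (file names hypothetical); whichever input is read last supplies the metadata of the merged document:

#   python htmlcat.py merged.html part1.html part2.html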