def main():
    aparser = argparse.ArgumentParser(
        description='Stream editor for files in Daba format')
    aparser.add_argument('infile', help='Input file (.html)')
    aparser.add_argument('-o', '--outfile', help='Output file', default=None)
    aparser.add_argument('-s', '--script', help='File with edit commands', required=True)
    aparser.add_argument('-v', '--verbose', help='Print info messages', action='store_true')
    args = aparser.parse_args()
    if not args.outfile:
        args.outfile = args.infile
    # start processing
    if args.verbose:
        sys.stderr.write(u'Processing {0} with rules from {1}...\n'.format(
            args.infile, args.script).encode('utf-8'))
    sed = StreamEditor(verbose=args.verbose)
    script = ScriptParser(args.script)
    in_handler = formats.HtmlReader(args.infile, compatibility_mode=False)
    processed_tokens = list(sed.apply_script(script, in_handler))
    if sed.dirty:
        out_handler = formats.HtmlWriter(
            (in_handler.metadata,
             in_handler.make_compatible_glosses(processed_tokens)),
            args.outfile)
        out_handler.write()
        if args.verbose:
            sys.stderr.write(u'Written {0}\n'.format(
                args.outfile).encode('utf-8'))
def main():
    aparser = argparse.ArgumentParser(
        description='Daba suite. Metadata pretty printer.')
    aparser.add_argument('infile', help='Input file (.html)')
    aparser.add_argument('-f', '--field', dest='fields', action='append',
                         help='Metadata field name')
    aparser.add_argument('-w', '--words', dest='fields', action='append_const',
                         const='_auto:words',
                         help='Number of word tokens in document')
    aparser.add_argument('-s', '--sentences', dest='fields', action='append_const',
                         const='_auto:sentences',
                         help='Number of sentences in document')
    aparser.add_argument('-p', '--paragraphs', dest='fields', action='append_const',
                         const='_auto:paragraphs',
                         help='Number of paragraphs in document')
    aparser.add_argument('-d', '--delimiter', action='store', default="\t",
                         help='Delimiter for output fields')
    aparser.add_argument('-a', '--all', action='store_true',
                         help='Print all metadata found in a file')
    args = aparser.parse_args()
    reader = formats.HtmlReader(args.infile, onlymeta=True)
    meta = defaultdict(unicode)
    for k, v in reader.metadata.items():
        meta[k] = v
    sys.stdout.write(unicode(args.infile).encode('utf-8'))
    if args.all:
        sys.stdout.write('\n')
        for name in sorted(meta.iterkeys()):
            sys.stdout.write(u"\t".join([name, meta[name]]).encode('utf-8'))
            sys.stdout.write('\n')
        sys.stdout.write('\n\n')
    else:
        sys.stdout.write(args.delimiter)
        sys.stdout.write(
            args.delimiter.join([meta[field] for field in args.fields]).encode('utf-8'))
        sys.stdout.write('\n')
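# For reference, the two output modes above produce output like the following
# (the file name and numbers are invented for illustration):
#
#   with -a/--all:    the file name, then one "name<TAB>value" line per
#                     metadata field;
#   with -f/-w/-s/-p: a single line, the file name plus the requested fields
#                     joined by the -d delimiter, e.g.
#                     kibaru001.dis.html<TAB>1534<TAB>87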
def main():
    aparser = argparse.ArgumentParser(description='Fix multiple ps')
    aparser.add_argument('infile', help='Input file (.html)')
    aparser.add_argument('-o', '--outfile', help='Output file', default=None)
    args = aparser.parse_args()
    handler = formats.HtmlReader(args.infile)
    for gloss, index in handler.itergloss():
        if not gloss.gloss and len(gloss.ps) > 1 and gloss.morphemes:
            stemps = [m.ps for m in gloss.morphemes if 'mrph' not in m.ps]
            if len(stemps) == 1 and stemps[0].issubset(gloss.ps):
                print gloss, '->',
                gloss = gloss._replace(ps=stemps[0])
                print gloss
                handler.setgloss(gloss, index)
    out = formats.HtmlWriter((handler.metadata, handler.glosses),
                             args.outfile or args.infile)
    out.write()
        for (toktype, tokvalue) in sentannot:
            if toktype in ['w']:
                sourceform, stage, glosslist = tokvalue
                w = e.SubElement(annot, 'word')
                w.tail = '\n'
                # FIXME: format doesn't support ambiguity, always take first gloss
                gloss = glosslist[0]
                wtext = e.SubElement(w, 'item', {'type': 'txt', 'lang': 'bam'})
                wtext.text = gloss.form
                wtext.tail = '\n'
                ms = e.SubElement(w, 'morphemes')
                ms.tail = '\n'
                if not gloss.ps or (not gloss.gloss and gloss.morphemes):
                    for morph in gloss.morphemes:
                        morph_to_xml(ms, morph)
                else:
                    morph_to_xml(ms, gloss)
        self.xml = root

    def write(self):
        e.ElementTree(self.xml).write(self.filename,
                                      encoding=self.encoding,
                                      xml_declaration=True)


infile = sys.argv[1]
outfile = sys.argv[2]
reader = formats.HtmlReader(infile)
SHXmlWriter((reader.metadata, reader.glosses), outfile).write()
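# morph_to_xml() is referenced above but not defined in this excerpt. A minimal
# sketch of what such a helper might look like, assuming Gloss tuples with
# `form`, `ps` and `gloss` fields and the same item/type/lang conventions as
# the word-level element above (illustration only, not the project's actual
# implementation):
def morph_to_xml(parent, morph):
    m = e.SubElement(parent, 'morph')
    m.tail = '\n'
    mform = e.SubElement(m, 'item', {'type': 'txt', 'lang': 'bam'})
    mform.text = morph.form
    mpos = e.SubElement(m, 'item', {'type': 'pos', 'lang': 'en'})
    mpos.text = u'/'.join(morph.ps)
    mgloss = e.SubElement(m, 'item', {'type': 'gls', 'lang': 'en'})
    mgloss.text = morph.gloss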
def main():
    aparser = argparse.ArgumentParser(description='Daba disambiguator')
    # aparser.add_argument('-i', '--infile', help='Input file (.html)', default="sys.stdin")
    # aparser.add_argument('-o', '--outfile', help='Output file (.html)', default="sys.stdout")
    aparser.add_argument('-l', '--learn', help='Learn model from data (and save as F if provided)', default=None)
    aparser.add_argument('-p', '--pos', help='Prediction for POS', default=False, action='store_true')
    aparser.add_argument('-t', '--tone', help='Prediction for tones', default=False, action='store_true')
    aparser.add_argument('-g', '--gloss', help='Prediction for glosses', default=False, action='store_true')
    aparser.add_argument('-e', '--evalsize', type=int, default=10,
                         help='Percent of randomized data to use for evaluation (default 10)')
    aparser.add_argument('-v', '--verbose', help='Verbose output', default=False, action='store_true')
    args = aparser.parse_args()

    if args.learn:
        if not (args.pos or args.tone or args.gloss):
            print 'Choose pos, tone, gloss or combination of them'
            exit(0)

        print 'Make list of files'
        files1 = glob.iglob("../corbama/*/*.dis.html")
        files2 = glob.iglob("../corbama/*.dis.html")
        allfiles = ""
        for file1, file2 in zip(files1, files2):
            allfiles += file1 + ',' + file2 + ','

        allsents = []
        print 'Open files and find features / supervision tags'
        for infile in allfiles.split(','):
            if infile:
                print '-', infile
                sent = []
                in_handler = formats.HtmlReader(infile, compatibility_mode=False)
                for token in in_handler:
                    if token.type == 'w' or token.type == 'c':
                        tags = ''
                        if args.pos:
                            for ps in token.gloss.ps:
                                tags += ps
                        if args.tone:
                            tags += token.gloss.form.encode('utf-8')
                        if args.gloss:
                            tags += token.gloss.gloss.encode('utf-8')
                        sent.append((token.token, tags))
                        if token.type == 'c' and token.token in ['.', '?', '!']:
                            if len(sent) > 1:
                                allsents.append(sent)
                            sent = []

        datalength = len(allsents)
        p = (1 - args.evalsize / 100.0)
        print 'Randomize and split the data in train (', int(p * datalength), ' sentences) / test (', int(datalength - p * datalength), ' sentences)'
        random.seed(123456)
        random.shuffle(allsents)
        train_set = allsents[:int(p * datalength)]
        test_set = allsents[int(p * datalength):datalength]

        print 'Building classifier (CRF/NLTK)'
        tagger = CRFTagger(verbose=args.verbose,
                           training_opt={'feature.minfreq': 10})
        t1 = time.time()
        tagger.train(train_set, args.learn)
        t2 = time.time()
        texec = t2 - t1
        print "... done in", time.strftime('%H %M %S', time.localtime(texec))

        print 'Evaluating classifier'
        print tagger.evaluate(test_set)

        if args.verbose:
            print 'Compute detailed output'
    else:
        print 'USE...'
        aparser.print_help()
        exit(0)
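# A minimal sketch of the data shape nltk's CRFTagger consumes: each sentence
# is a list of (token, tag) pairs, exactly what the training loop above
# accumulates in allsents. The toy tokens/tags and the model file name below
# are invented for illustration (python-crfsuite must be installed):
from nltk.tag import CRFTagger

toy_train = [
    [(u'a', u'pers'), (u'taara', u'v'), (u'.', u'c')],
    [(u'mun', u'prn'), (u'don', u'cop'), (u'?', u'c')],
]
toy_tagger = CRFTagger(training_opt={'feature.minfreq': 1})
toy_tagger.train(toy_train, 'toy.crfsuite')  # trains and saves the model file
print toy_tagger.tag_sents([[u'a', u'taara', u'.']])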
def main():
    oparser = argparse.ArgumentParser(
        description='Native Daba format to vertical format converter')
    oparser.add_argument('infile', help='Input file (.html)')
    oparser.add_argument("-t", "--tonal", action="store_true",
                         help="Make tonal lemmas")
    oparser.add_argument("-u", "--unique", action="store_true",
                         help="Print only unique lemmas and glosses")
    oparser.add_argument("-n", "--nullify", action="store_true",
                         help="Transliterate all non-ascii characters")
    oparser.add_argument(
        "-v", "--variants",
        help="Treat all variants in given dictionary as alternative lemmas")
    oparser.add_argument(
        "-p", "--polisemy", action="store_true",
        help="Show polysemy in a separate field (suggests -v)")
    oparser.add_argument(
        "-c", "--convert", action="store_true",
        help="Normalize wordform field, move source to the end")
    oparser.add_argument(
        "-k", "--keepsource", action="store_true",
        help="Keep source token at the head, to use with --convert")
    oparser.add_argument(
        "-i", "--igt", action="store_true",
        help="Add morpheme-segmented form/gloss pair suited to copy as IGT examples")
    oparser.add_argument("-d", "--debugfields", action="store_true",
                         help="Add debug fields for Jean-Jacques")
    args = oparser.parse_args()

    reader = formats.HtmlReader(args.infile.decode("utf-8"))

    if args.variants:
        vardict, polidict = VariantsLoader(args.variants).get()
    else:
        vardict = None
        polidict = None

    print "<doc ",
    print u'id="{0}"'.format(
        os.path.basename(args.infile.decode("utf-8"))).encode('utf-8'),
    metad = dict(reader.metadata)
    print u'source_type="{0}"'.format(
        metad.setdefault('source:type', 'UNDEF')).encode('utf-8'),
    print u'source_year="{0}"'.format(
        metad.setdefault('source:year', 'UNDEF')).encode('utf-8'),
    print u'text_translation="{0}"'.format(
        metad.setdefault('text:translation', 'UNDEF')).encode('utf-8'),
    print u'text_medium="{0}"'.format(
        metad.setdefault('text:medium', 'UNDEF')).encode('utf-8'),
    print u'author_name="{0}"'.format(
        metad.setdefault('author:name', 'UNDEF')).encode('utf-8'),
    try:
        genres = metad['text:genre'].split(';')
        hgenres = [g.split(' : ')[0] for g in genres] + genres
        hgenres.sort()
        metad['text:genre'] = u';'.join(hgenres)
        print u'text_genre="{0}"'.format(metad['text:genre']).encode('utf-8'),
    except KeyError:
        print 'text_genre="UNDEF"',
    try:
        print u'text_title="{0}"'.format(metad['text:title']).encode('utf-8'),
    except KeyError:
        print 'text_title="UNDEF"',
    print ">"

    for par in reader.glosses:
        print "<p>"
        for sent, annot in par:
            print "<s>"
            for token in annot:
                print_token(token, args, vardict, polidict, make_lemmafunc(args))
            print "</s>"
        print "</p>"
    print "</doc>"
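# For reference, the converter above emits a vertical-format document: an
# SGML-like skeleton with one token per line inside <s> elements. The exact
# per-token columns come from print_token(), which is not shown in this
# excerpt; the sketch below only illustrates the overall shape:
#
#   <doc id="..." source_type="..." source_year="..." ... text_title="...">
#   <p>
#   <s>
#   ...one tab-separated line per token...
#   </s>
#   </p>
#   </doc>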
def main():
    aparser = argparse.ArgumentParser(description='Lexicon printer for TreeTagger training')
    aparser.add_argument("-r", "--runtimedir", help="Runtime dir with binary saved dictionaries")
    aparser.add_argument("-t", "--tonal", action="store_true", help="Preserve tones on word forms")
    aparser.add_argument("-j", "--join", action="store_true", help="Join all sources")
    aparser.add_argument("-p", "--plain", action="store_true", help="Output plain lists of tokens")
    aparser.add_argument("-c", "--corpus", default=None, help="Corpus root")
    aparser.add_argument("-g", "--glob", default="*.pars.html",
                         help="Filename pattern for search in the corpus dir")
    args = aparser.parse_args()
    # locale.setlocale(locale.LC_ALL, 'bm_ML')

    if args.join:
        globaldict = defaultdict(list)

    if args.corpus:
        seentokens = set()
        parsfiles = []
        for root, dirnames, filenames in os.walk(args.corpus):
            for filename in fnmatch.filter(filenames, args.glob):
                parsfile = os.path.join(root, filename)
                reader = formats.HtmlReader(parsfile)
                lastpunct = None
                for token in reader:
                    if token.type == 'w':
                        if lastpunct:
                            print_line(lastpunct.value, [' '.join([lastpunct.type, lastpunct.value])])
                            lastpunct = None
                        form = dedot(token.glosslist[0].form).lower()
                        if not args.tonal:
                            form = detone(form)
                        else:
                            # FIXME: unsupported tonal for corpus
                            pass
                        if args.plain:
                            result = make_taglist(token.glosslist)
                            print_line(form, result)
                        else:
                            if form not in seentokens:
                                result = make_taglist(token.glosslist)
                                seentokens.add(form)
                                if args.join:
                                    globaldict[form].extend(result)
                                else:
                                    print_line(form, result)
                    elif token.type == 'c':
                        lastpunct = token
                    elif token.type == 's':
                        if lastpunct:
                            print_line(lastpunct.value, [' '.join(['SENT', lastpunct.value])])
                            lastpunct = None

    if args.runtimedir:
        seenkeys = set()
        dictionary = mparser.DictLoader(runtimedir=args.runtimedir).dictionary
        for form in dictionary:
            if ' ' not in form:
                if not args.tonal:
                    form = detone(form)
                if args.plain:
                    for gloss in dictionary[form]:
                        print gloss
                        result = make_taglist([gloss], formforlemma=True)
                        for lemma in result:
                            print_line(form, [lemma])
                else:
                    if args.corpus and form in seentokens:
                        continue
                    if form not in seenkeys:
                        glosses = dictionary[form]
                        result = make_taglist(glosses, formforlemma=True)
                        seenkeys.add(form)
                        if args.join:
                            globaldict[form].extend(result)
                        else:
                            print_line(form, result)

    if args.join:
        for form, result in globaldict.iteritems():
            print_line(form, result)
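# print_line() and make_taglist() are used above but not defined in this
# excerpt. A minimal sketch of print_line, assuming result is a list of
# "TAG lemma" strings (as the punctuation branches above suggest) and that
# each output line is a TreeTagger-style lexicon entry, i.e. the word form
# followed by tab-separated tag/lemma pairs (illustration only):
def print_line(form, result):
    print u'\t'.join([form] + result).encode('utf-8')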
#!/usr/bin/python
# -*- coding: utf-8 -*-
# SYNOPSIS: <outfile> <infiles...>

import formats
import sys

metadata = []
para = []
# concatenate paragraphs from all input files;
# metadata is taken from the last file read
for i in range(2, len(sys.argv)):
    infile = formats.HtmlReader(sys.argv[i])
    metadata = infile.metadata
    para.extend(infile.glosses)

outfile = formats.HtmlWriter((metadata, para), sys.argv[1])
outfile.write()