def lookup_gloss(self, gloss, gdict): "Gloss, Dictionary -> tuple(Gloss)" lookup_form = None try: if self.detone: bare = detone(gloss.form) if bare in gdict: lookup_form = bare else: if gloss.form in gdict: lookup_form = gloss.form else: bare = detone(gloss.form) if not gloss.form == bare and bare in gdict: lookup_form = bare if lookup_form: pattern = emptyGloss._replace(ps=gloss.ps, gloss=gloss.gloss) return tuple([dgloss for dgloss in gdict[lookup_form] if dgloss.matches(pattern)]) else: return () except (KeyError, AttributeError): if gloss.form in gdict: print "PP", gloss.form, gdict[gloss.form] else: print "PN", gloss.form return ()
def lookup_gloss(self, gloss, gdict):
    'Gloss, Dictionary -> tuple(Gloss)'
    lookup_form = None
    parts = None
    try:
        if self.detone:
            bare = detone(gloss.form)
            if bare in gdict:
                lookup_form = bare
        else:
            if gloss.form in gdict:
                lookup_form = gloss.form
            elif '-' in gloss.form:
                parts = gloss.form.split('-')
                lookup_form = ''.join(parts)
            else:
                bare = detone(gloss.form)
                if not gloss.form == bare and bare in gdict:
                    lookup_form = bare
        if lookup_form:
            pattern = emptyGloss._replace(ps=gloss.ps, gloss=gloss.gloss)
            if parts:
                out = []
                for dgloss in gdict[lookup_form]:
                    if (dgloss.matches(pattern)
                            and len(dgloss.morphemes) == len(parts)
                            and tones_match(gloss.form, dgloss.form)):
                        out.append(dgloss)
                return tuple(out)
            else:
                return tuple([dgloss for dgloss in gdict[lookup_form]
                              if dgloss.matches(pattern)
                              and tones_match(gloss.form, dgloss.form)])
        else:
            return ()
    except (KeyError, AttributeError):
        if gloss.form in gdict:
            print 'PP', gloss.form, gdict[gloss.form]
        return ()
def push_items(d, l, ps=frozenset([]), ge=''):
    for k, i in l:
        lx = i._replace(ps=set([ps]), gloss=ge)
        d.setdefault(k, []).append(lx)
        detoned = detone(k)
        if not detoned == k:
            d.setdefault(detoned, []).append(lx)
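# Usage sketch for push_items above. The Gloss namedtuple matches the fields
# used elsewhere in this code (form, ps, gloss, morphemes); the stand-in
# detone() is an illustrative assumption that strips combining diacritics,
# not daba's actual tone-handling rules.
from collections import namedtuple
import unicodedata

Gloss = namedtuple('Gloss', 'form ps gloss morphemes')

def detone(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if not unicodedata.combining(c))

d = {}
push_items(d, [(u'jìri', Gloss(u'jìri', (), '', ()))], ps=u'n', ge=u'tree')
# the entry is now reachable under both the toned and the bare form
assert sorted(d) == [u'jiri', u'jìri']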
def make_lemmafunc(args):
    if args.tonal:
        get_lemma = lambda x: dedot(x)
    elif args.nullify:
        nullify_dict = {u'ɔ': 'o', u'ɛ': 'e', u'ɲ': 'ny'}

        def get_lemma(x):
            x = detone(''.join(c for c in x if c not in '.'))
            for source, target in nullify_dict.items():
                x = x.replace(source, target)
            return x
    else:
        get_lemma = lambda x: detone(dedot(x))
    return get_lemma
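# Usage sketch for make_lemmafunc. The argparse.Namespace stands in for the
# script's parsed options; dedot() and detone() below are simplified
# assumptions (drop morpheme-boundary dots; strip tone diacritics) rather
# than daba's real implementations.
import argparse
import unicodedata

def dedot(s):
    return s.replace(u'.', u'')

def detone(s):
    return ''.join(c for c in unicodedata.normalize('NFD', s)
                   if not unicodedata.combining(c))

get_lemma = make_lemmafunc(argparse.Namespace(tonal=False, nullify=True))
# detone first strips the grave accent, then the nullify table maps ɛ -> e
print(get_lemma(u'sɛ̀bɛn.ni'))  # -> sebenni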
def lookup_gloss(self, gloss, gdict):
    'Gloss, Dictionary -> tuple(Gloss)'
    lookup_form = None
    parts = None
    try:
        if self.detone:
            bare = detone(gloss.form)
            if bare in gdict:
                lookup_form = bare
        else:
            if gloss.form in gdict:
                lookup_form = gloss.form
            elif '-' in gloss.form:
                parts = gloss.form.split('-')
                lookup_form = ''.join(parts)
            else:
                bare = detone(gloss.form)
                if not gloss.form == bare and bare in gdict:
                    lookup_form = bare
        if lookup_form:
            pattern = emptyGloss._replace(ps=gloss.ps, gloss=gloss.gloss)
            if parts:
                out = []
                for dgloss in gdict[lookup_form]:
                    if (dgloss.matches(pattern)
                            and len(dgloss.morphemes) == len(parts)):
                        out.append(dgloss)
                return tuple(out)
            else:
                return tuple([dgloss for dgloss in gdict[lookup_form]
                              if dgloss.matches(pattern)])
        else:
            return ()
    except (KeyError, AttributeError):
        if gloss.form in gdict:
            print 'PP', gloss.form, gdict[gloss.form]
        return ()
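# Minimal sketch of the data model lookup_gloss relies on. The matches()
# semantics here (empty pattern fields act as wildcards) are an assumption
# made for illustration; daba's real Gloss.matches is more elaborate, and the
# dictionary contents are toy data.
from collections import namedtuple

class Gloss(namedtuple('Gloss', 'form ps gloss morphemes')):
    def matches(self, pattern):
        ps_ok = not pattern.ps or bool(set(pattern.ps) & set(self.ps))
        gloss_ok = not pattern.gloss or pattern.gloss == self.gloss
        return ps_ok and gloss_ok

emptyGloss = Gloss(form='', ps=(), gloss='', morphemes=())

gdict = {u'jiri': [Gloss(u'jìri', ('n',), 'tree', ()),
                   Gloss(u'jíri', ('v',), 'descend', ())]}
pattern = emptyGloss._replace(ps=('n',), gloss='')
print([g for g in gdict[u'jiri'] if g.matches(pattern)])  # only the noun survives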
def main():
    aparser = argparse.ArgumentParser(description='Daba disambiguator')
    aparser.add_argument('-v', '--verbose', help='Verbose output',
                         default=False, action='store_true')
    aparser.add_argument('-l', '--learn',
                         help='Learn model from data (and save as F if provided)',
                         default=None)
    aparser.add_argument('-p', '--pos', help='Prediction for POS',
                         default=False, action='store_true')
    aparser.add_argument('-t', '--tone', help='Prediction for tones',
                         default=False, action='store_true')
    aparser.add_argument('-r', '--root', help='Corpus root dir')
    aparser.add_argument('-f', '--filelist',
                         help='Path to a list of files to learn from')
    # aparser.add_argument('-g', '--gloss', help='Prediction for glosses',
    #                      default=False, action='store_true')
    aparser.add_argument('-e', '--evalsize', type=int, default=10,
                         help='Percentage of the data reserved for evaluation, '
                              'the rest being used for training (default 10)')
    aparser.add_argument('-d', '--disambiguate', default=None,
                         help='Use model F to disambiguate data; each gloss '
                              'list is reordered by decreasing probability')
    aparser.add_argument('--select', action='store_true',
                         help='Only used together with -d: keep only the most '
                              'likely gloss(es) in each list')
    aparser.add_argument('-i', '--infile', help='Input file (.html)',
                         default=sys.stdin)
    aparser.add_argument('-o', '--outfile', help='Output file (.html)',
                         default=sys.stdout)
    aparser.add_argument('-s', '--store', default=None,
                         help='Store tagged raw data in a file (.csv) for '
                              'further research purposes')
    args = aparser.parse_args()
    if args.verbose:
        print(args)

    # NB: the --gloss option is commented out above, so only POS and tone
    # prediction are checked here (referencing args.gloss would raise
    # AttributeError).
    if args.learn:
        if not (args.pos or args.tone):
            print('Choose pos, tone or a combination of them')
            exit(0)
        print('Make list of files')
        allfiles = []
        with codecs.open(args.filelist, 'r', encoding="utf-8") as filelist:
            for line in filelist:
                allfiles.append(line.strip())
        allsents = []

        # for debugging
        # allfiles = '../corbama/sisoko-daa_ka_kore.dis.html'

        if args.tone:
            try:
                enc = encoder_tones()
            except Exception:
                enc = None
                print("Error : unable to initialize the tone encoder !")

        print('Open files and find features / supervision tags')
        for infile in allfiles:
            if infile:
                print('-', infile)
                sent = []
                html_parser = FileParser()
                html_parser.read_file(os.path.join(args.root, infile))
                for snum, sentence in enumerate(html_parser.glosses):
                    for tnum, token in enumerate(sentence[2]):
                        tag = ''
                        if token.type == 'w' or token.type == 'c':
                            tags = ''
                            if args.pos:
                                tags = '/'.join(token.gloss.ps)
                                wordform = detone(token.gloss.form)
                                sent.append((wordform, tags))
                            elif args.tone:
                                # Why not learn tonal forms that contain a
                                # vertical bar? Because they occur fewer than
                                # 10 times across the disambiguated corpora:
                                # the case seems too rare to bring any real
                                # improvement to the tone model. Nothing in
                                # the framework's design, however, forbids
                                # including them in the training data and
                                # measuring their contribution.
                                if '|' not in token.gloss.form:
                                    [codes, chunks] = enc.differential_encode(
                                        token.token, token.gloss.form)
                                    for chunk, code in zip(chunks, codes):
                                        try:
                                            sent.append((chunk, code))
                                        except LookupError:
                                            pass
                            """
                            elif args.gloss:
                                tags += token.gloss.gloss
                                sent.append((token.token, tags))
                            """
                    if len(sent) > 1:
                        allsents.append(sent)
                        sent = []
        if args.verbose and args.tone:
            enc.report()

        # Build the training and evaluation sets
        p = (1 - args.evalsize / 100.0)
        train_set, eval_set = sampling(allsents, p)
        print('Split the data in train (', len(train_set),
              ' sentences) / test (', len(eval_set), ' sentences)')

        print('Building classifier (CRF/NLTK)')
        # Initialization
        t1 = time.time()
        if args.tone:
            num_phases = len([False, True]) * len(mode_indicators)
            myzip = zipfile.ZipFile(args.learn + '.zip', 'w')
        else:
            num_phases = 1

        # Training
        for phase in range(num_phases):
            tagger = CRFTagger(verbose=args.verbose,
                               training_opt={'feature.minfreq': 10})
            trainer = pycrfsuite.Trainer(verbose=tagger._verbose)
            trainer.set_params(tagger._training_options)
            if num_phases > 1:
                model_name = args.learn + '.' + str(phase)
            else:
                model_name = args.learn
            # train_set : list(list((str, list(str))))
            for sent in train_set:
                tokens = unzip(sent)[0]
                labels = unzip(sent)[1]
                if num_phases > 1:
                    labels = [code_dispatcher(label)[phase] for label in labels]
                features = [_get_features_customised_for_tones(tokens, i)
                            for i in range(len(tokens))]
                trainer.append(features, labels)
            trainer.train(model=model_name)
            if num_phases > 1:
                myzip.write(model_name)
                os.remove(model_name)
        if num_phases > 1:
            myzip.close()
        print("... done in", get_duration(t1_secs=t1, t2_secs=time.time()))

        # Evaluation
        print('Evaluating classifier')
        # gold_set, predicted_set : list(list((str, str)))
        # input_set, output_gold_set : list(list(str))
        gold_set = eval_set
        input_set = [unzip(sent)[0] for sent in gold_set]
        predicted_set = [list() for sent in gold_set]
        if num_phases > 1:
            myzip = zipfile.ZipFile(args.learn + '.zip', 'r')
        for phase in range(num_phases):
            tagger = CRFTagger(verbose=args.verbose,
                               training_opt={'feature.minfreq': 10})
            trainer = pycrfsuite.Trainer(verbose=tagger._verbose)
            trainer.set_params(tagger._training_options)
            if num_phases > 1:
                model_name = args.learn + '.' + str(phase)
                myzip.extract(model_name)
            else:
                model_name = args.learn
            tagger.set_model_file(model_name)
            for i, sent in enumerate(input_set):
                features = [_get_features_customised_for_tones(sent, j)
                            for j in range(len(sent))]
                labels = tagger._tagger.tag(features)
                if num_phases > 1:
                    labels = [code_dispatcher(label)[phase] for label in labels]
                tagged_sent = list(zip(sent, labels))
                if not predicted_set[i]:
                    predicted_set[i] = tagged_sent
                else:
                    sent_acc, labels_acc = unzip(predicted_set[i])
                    labels_acc = [label_acc + label
                                  for label_acc, label in zip(labels_acc, labels)]
                    predicted_set[i] = list(zip(sent_acc, labels_acc))
            if num_phases > 1:
                os.remove(model_name)
        if num_phases > 1:
            myzip.close()

        # gold_tokens, predicted_tokens : list((str, str))
        predicted_tokens = list(itertools.chain(*predicted_set))
        if num_phases > 1:
            predicted_tokens = [tuple([pair[0], code_resort(pair[1])])
                                for pair in predicted_tokens]
        gold_tokens = list(itertools.chain(*gold_set))
        # gold_tokens_eval, predicted_tokens_eval : list(str)
        if args.tone:
            gold_tokens_eval = getTag(gold_tokens)
            predicted_tokens_eval = getTag(predicted_tokens)
        else:
            gold_tokens_eval = gold_tokens
            predicted_tokens_eval = predicted_tokens

        if args.store and args.tone:
            stored_filename = args.store
            csv_export(enc, stored_filename, gold_tokens, predicted_tokens)
        print("Accuracy : {:>5.3f}".format(
            accuracy(gold_tokens_eval, predicted_tokens_eval)))
        if args.verbose and args.store:
            print("Tagged result is exported in {}".format(args.store))

    elif args.disambiguate and args.infile and args.outfile:
        # Read the input .html text
        html_parser = FileParser()
        tagger = CRFTagger()
        if args.pos:
            try:
                tagger.set_model_file(args.disambiguate)
            except IOError:
                print("Error : unable to open the model {} !".format(
                    args.disambiguate))
                exit(1)
            try:
                html_parser.read_file(args.infile)
            except IOError:
                print("Error : unable to open the input file {} !".format(
                    args.infile))
                exit(1)

            # Export the disambiguation result as .html
            for snum, sentence in enumerate(html_parser.glosses):
                tokens = [token.token for token in sentence[2]]
                features = [_get_features_customised_for_tones(tokens, i)
                            for i in range(len(tokens))]
                tagger._tagger.set(features)
                for tnum, token in enumerate(sentence[2]):
                    options = list()
                    if token.value and len(token.value) > 2:
                        for nopt, option in enumerate(token.value[2]):
                            try:
                                tag = option.ps[0]
                            except IndexError:
                                tag = ''
                            prob = tagger._tagger.marginal(tag, tnum)
                            options.append((prob, option))
                        reordered_probs, reordered_options = unzip(
                            sorted(options, reverse=True))
                        if args.select:
                            prob_max = reordered_probs[0]
                            reordered_options = tuple(
                                [reordered_options[i]
                                 for i, p in enumerate(reordered_probs)
                                 if p >= prob_max])
                        html_parser.glosses[snum][1][tnum] = reordered_options
        elif args.tone:
            pass

        try:
            html_parser.write(args.outfile)
        except IOError:
            print("Error : unable to create the output file {}".format(
                args.outfile))
    else:
        aparser.print_help()
        exit(0)
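# The small helpers main() relies on (unzip, sampling, getTag) are defined
# elsewhere in the module; the sketches below are assumptions inferred from
# the call sites, not daba's actual implementations.
import random

def unzip(pairs):
    # [(tok1, lab1), (tok2, lab2)] -> [(tok1, tok2), (lab1, lab2)]
    return list(zip(*pairs))

def sampling(sents, p):
    # Shuffle the sentence list and split it: a fraction p for training,
    # the remainder for evaluation.
    sents = list(sents)
    random.shuffle(sents)
    cut = int(len(sents) * p)
    return sents[:cut], sents[cut:]

def getTag(tokens):
    # Keep only the label of each (form, label) pair for scoring.
    return [label for form, label in tokens]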
def __init__(self, filename, encoding='utf-8',
             store=True, variants=False, polisemy=False):
    self._dict = DabaDict()
    self._variants = VariantsDict()
    self._polisemy = defaultdict(ddlist)
    self.line = 0
    lemmalist = []
    key = None
    ps = ()
    ge = ''

    def parsemm(v):
        try:
            f, p, g = v.split(':')
            if p:
                ps = tuple(p.split('/'))
            else:
                ps = ()
            return Gloss(f, ps, g, ())
        except ValueError:
            print "Error line:", str(self.line), unicode(v).encode('utf-8')

    def normalize(value):
        return normalizeText(
            value.translate({ord(u'.'): None, ord(u'-'): None}).lower())

    def make_item(value):
        return [normalize(value),
                Gloss(form=value, ps=(), gloss="", morphemes=())]

    def push_items(primarykey, lemmalist):
        for key, lx in lemmalist:
            self._dict[key] = lx
            detonedkey = detone(key)
            if not detonedkey == key:
                self._dict[detonedkey] = lx

    with codecs.open(filename, 'r', encoding=encoding) as dictfile:
        for line in dictfile:
            self.line = self.line + 1
            # end of the article/dictionary
            if not line or line.isspace():
                lemmalist = [(key, item._replace(ps=ps, gloss=ge))
                             for key, item in lemmalist]
                if lemmalist and not ps == ('mrph',):
                    if store:
                        push_items(key, lemmalist)
                    if variants and len(lemmalist) > 1:
                        self._variants.add(zip(*lemmalist)[1])
                lemmalist = []
                ps = ()
                ge = ''
                key = None
            elif line.startswith('\\'):
                tag, space, value = line[1:].partition(' ')
                value = value.strip()
                if tag in ['lang', 'ver', 'name']:
                    self._dict.__setattr__(tag, value)
                elif tag in ['lx', 'le', 'va', 'vc']:
                    key = normalize(value)
                    lemmalist.append(make_item(value))
                elif tag in ['mm']:
                    lemmalist[-1][1] = lemmalist[-1][1]._replace(
                        morphemes=lemmalist[-1][1].morphemes + (parsemm(value),))
                elif tag in ['ps'] and not ps:
                    if value:
                        ps = tuple(value.split('/'))
                    else:
                        ps = ()
                elif tag in ['gf', 'ge'] and not ge:
                    ge = value
                elif tag in ['gv']:
                    if polisemy:
                        self._polisemy[key][ge].append(value)
                        dk = detone(key)
                        if not dk == key:
                            self._polisemy[dk][ge].append(value)
        else:
            # flush the last record when the file does not end in a blank line
            if lemmalist:
                if store:
                    push_items(key, lemmalist)
                if variants:
                    self._variants.add(zip(*lemmalist)[1])
    if not self._dict.attributed():
        print r"Dictionary does not contain the obligatory \lang, \name or \ver fields. Please specify them and try to load again."
        print self._dict.lang, self._dict.name, self._dict.ver
def __init__(self, filename, encoding='utf-8',
             store=True, variants=False, polisemy=False):
    self._dict = DabaDict()
    self._variants = VariantsDict()
    self._polisemy = defaultdict(ddlist)
    self.line = 0
    lemmalist = []
    key = None
    ps = ()
    ge = ''
    # NB: these flags must exist before the first record is read; the
    # original omitted the initialization, raising NameError on the first
    # \gf or \ge tag.
    seengf = False
    seenge = False

    def parsemm(v):
        try:
            f, p, g = v.split(':')
            if p:
                ps = tuple(p.split('/'))
            else:
                ps = ()
            return Gloss(f, ps, g, ())
        except ValueError:
            print "Error line:", str(self.line), unicode(v).encode('utf-8')

    def normalize(value):
        return normalizeText(
            value.translate({ord(u'.'): None, ord(u'-'): None}).lower())

    def make_item(value):
        return [normalize(value),
                Gloss(form=value, ps=(), gloss="", morphemes=())]

    def push_items(primarykey, lemmalist):
        for key, lx in lemmalist:
            self._dict[key] = lx
            detonedkey = detone(key)
            if not detonedkey == key:
                self._dict[detonedkey] = lx

    def process_record(lemmalist):
        lemmalist = [(key, item._replace(ps=ps, gloss=ge))
                     for key, item in lemmalist]
        if lemmalist and not ps == ('mrph',):
            if store:
                push_items(key, lemmalist)
            if variants and len(lemmalist) > 1:
                self._variants.add(zip(*lemmalist)[1])

    with codecs.open(filename, 'r', encoding=encoding) as dictfile:
        for line in dictfile:
            self.line = self.line + 1
            # end of the article/dictionary
            if not line or line.isspace():
                process_record(lemmalist)
                lemmalist = []
                ps = ()
                ge = ''
                key = None
                seengf = False
                seenge = False
            elif line.startswith('\\'):
                tag, space, value = line[1:].partition(' ')
                value = value.strip()
                if tag in ['lang', 'ver', 'name']:
                    self._dict.__setattr__(tag, value)
                elif tag in ['lx', 'le', 'va', 'vc']:
                    key = normalize(value)
                    lemmalist.append(make_item(value))
                elif tag in ['mm']:
                    lemmalist[-1][1] = lemmalist[-1][1]._replace(
                        morphemes=lemmalist[-1][1].morphemes + (parsemm(value),))
                elif tag in ['ps'] and not ps:
                    if value:
                        ps = tuple(value.split('/'))
                    else:
                        ps = ()
                elif tag in ['gf'] and not seengf:
                    ge = value
                    seengf = True
                elif tag in ['ge'] and not seenge:
                    if not seengf:
                        ge = value
                    seenge = True
                elif tag in ['gv']:
                    if polisemy:
                        self._polisemy[key][ge].append(value)
                        dk = detone(key)
                        if not dk == key:
                            self._polisemy[dk][ge].append(value)
        else:
            # flush the last record when the file does not end in a blank line
            process_record(lemmalist)
    if not self._dict.attributed():
        print r"Dictionary does not contain the obligatory \lang, \name or \ver fields. Please specify them and try to load again."
        print self._dict.lang, self._dict.name, self._dict.ver
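# A minimal dictionary file in the SIL Toolbox format the loader above
# accepts: header fields (\lang, \name, \ver), then blank-line-separated
# records with \lx lemma, \ps part of speech, \ge gloss and optional \mm
# morphemes in form:ps:gloss notation. The tags are grounded in the parser
# above; the class name used to load the file is an assumption.
import codecs

sample = u"""\\lang bam
\\name test
\\ver 1.0

\\lx jìri
\\ps n
\\ge tree

\\lx bolokòni
\\ps n
\\ge finger
\\mm bolo:n:hand
\\mm koni:n:small
"""
with codecs.open('sample.txt', 'w', encoding='utf-8') as f:
    f.write(sample)
# reader = DictReader('sample.txt')  # hypothetical class owning __init__ above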
def main():
    aparser = argparse.ArgumentParser(
        description='Lexicon printer for TreeTagger training')
    aparser.add_argument("-r", "--runtimedir",
                         help="Runtime dir with binary saved dictionaries")
    aparser.add_argument("-t", "--tonal", action="store_true",
                         help="Preserve tones on word forms")
    aparser.add_argument("-j", "--join", action="store_true",
                         help="Join all sources")
    aparser.add_argument("-p", "--plain", action="store_true",
                         help="Output plain lists of tokens")
    aparser.add_argument("-c", "--corpus", default=None, help="Corpus root")
    aparser.add_argument("-g", "--glob", default="*.pars.html",
                         help="Filename pattern for search in the corpus dir")
    args = aparser.parse_args()
    # locale.setlocale(locale.LC_ALL, 'bm_ML')
    if args.join:
        globaldict = defaultdict(list)
    if args.corpus:
        seentokens = set()
        parsfiles = []
        for root, dirnames, filenames in os.walk(args.corpus):
            for filename in fnmatch.filter(filenames, args.glob):
                parsfile = os.path.join(root, filename)
                reader = formats.HtmlReader(parsfile)
                lastpunct = None
                for token in reader:
                    if token.type == 'w':
                        if lastpunct:
                            print_line(lastpunct.value,
                                       [' '.join([lastpunct.type,
                                                  lastpunct.value])])
                            lastpunct = None
                        form = dedot(token.glosslist[0].form).lower()
                        if not args.tonal:
                            form = detone(form)
                        else:
                            # FIXME: unsupported tonal for corpus
                            pass
                        if args.plain:
                            result = make_taglist(token.glosslist)
                            print_line(form, result)
                        else:
                            if form not in seentokens:
                                result = make_taglist(token.glosslist)
                                seentokens.add(form)
                                if args.join:
                                    globaldict[form].extend(result)
                                else:
                                    print_line(form, result)
                    elif token.type == 'c':
                        lastpunct = token
                    elif token.type == 's':
                        if lastpunct:
                            print_line(lastpunct.value,
                                       [' '.join(['SENT', lastpunct.value])])
                            lastpunct = None
    if args.runtimedir:
        seenkeys = set()
        dictionary = mparser.DictLoader(runtimedir=args.runtimedir).dictionary
        for form in dictionary:
            if ' ' not in form:
                if not args.tonal:
                    form = detone(form)
                if args.plain:
                    for gloss in dictionary[form]:
                        print gloss
                        result = make_taglist([gloss], formforlemma=True)
                        for lemma in result:
                            print_line(form, [lemma])
                else:
                    if args.corpus and form in seentokens:
                        continue
                    if form not in seenkeys:
                        glosses = dictionary[form]
                        result = make_taglist(glosses, formforlemma=True)
                        seenkeys.add(form)
                        if args.join:
                            globaldict[form].extend(result)
                        else:
                            print_line(form, result)
    if args.join:
        for form, result in globaldict.iteritems():
            print_line(form, result)
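# Sketch of the print_line helper this script assumes. TreeTagger lexicon
# files expect one line per form, "form<TAB>tag lemma<TAB>tag lemma ...";
# this stand-in is inferred from the call sites above (each result item is
# already a space-joined "tag lemma" string) and is not daba's actual
# implementation.
def print_line(form, taglist):
    print u'\t'.join([form] + taglist).encode('utf-8')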
def main():
    aparser = argparse.ArgumentParser(description='Daba disambiguator')
    aparser.add_argument('-v', '--verbose', help='Verbose output',
                         default=False, action='store_true')
    aparser.add_argument('-l', '--learn',
                         help='Learn model from data (and save as F if provided)',
                         default=None)
    aparser.add_argument('-p', '--pos', help='Prediction for POS',
                         default=False, action='store_true')
    aparser.add_argument('-t', '--tone', help='Prediction for tones',
                         default=False, action='store_true')
    aparser.add_argument('-r', '--root', help='Corpus root dir')
    aparser.add_argument('-f', '--filelist',
                         help='Path to a list of files to learn from')
    # aparser.add_argument('-g', '--gloss', help='Prediction for glosses',
    #                      default=False, action='store_true')
    aparser.add_argument('-e', '--evalsize', type=int, default=10,
                         help='Percentage of the data reserved for evaluation, '
                              'the rest being used for training (default 10)')
    aparser.add_argument('-d', '--disambiguate', default=None,
                         help='Use model F to disambiguate data; each gloss '
                              'list is reordered by decreasing probability')
    aparser.add_argument('--select', action='store_true',
                         help='Only used together with -d: keep only the most '
                              'likely gloss(es) in each list')
    aparser.add_argument('-i', '--infile', help='Input file (.html)',
                         default=sys.stdin)
    aparser.add_argument('-o', '--outfile', help='Output file (.html)',
                         default=sys.stdout)
    aparser.add_argument('-s', '--store', default=None,
                         help='Store tagged raw data in a file (.csv) for '
                              'further research purposes')
    args = aparser.parse_args()
    if args.verbose:
        print args

    # NB: the --gloss option is commented out above, so only POS and tone
    # prediction are checked here (referencing args.gloss would raise
    # AttributeError).
    if args.learn:
        if not (args.pos or args.tone):
            print 'Choose pos, tone or a combination of them'
            exit(0)
        print 'Make list of files'
        allfiles = []
        with codecs.open(args.filelist, 'r', encoding="utf-8") as filelist:
            for line in filelist:
                allfiles.append(line.strip())
        allsents = []

        # for debugging
        # allfiles = '../corbama/sisoko-daa_ka_kore.dis.html'

        if args.tone:
            try:
                enc = encoder_tones()
            except Exception:
                enc = None
                print "Error : unable to initialize the tone encoder !"

        print 'Open files and find features / supervision tags'
        for infile in allfiles:
            if infile:
                print '-', infile
                sent = []
                html_parser = FileParser()
                html_parser.read_file(os.path.join(args.root, infile))
                for snum, sentence in enumerate(html_parser.glosses):
                    for tnum, token in enumerate(sentence[2]):
                        tag = ''
                        if token.type == 'w' or token.type == 'c':
                            tags = ''
                            if args.pos:
                                tags = '/'.join(token.gloss.ps).encode('utf-8')
                                wordform = detone(token.gloss.form)
                                sent.append((wordform, tags))
                            elif args.tone:
                                # Why not learn tonal forms that contain a
                                # vertical bar? Because they occur fewer than
                                # 10 times across the disambiguated corpora:
                                # the case seems too rare to bring any real
                                # improvement to the tone model. Nothing in
                                # the framework's design, however, forbids
                                # including them in the training data and
                                # measuring their contribution.
                                if '|' not in token.gloss.form:
                                    [codes, chunks] = enc.differential_encode(
                                        token.token, token.gloss.form)
                                    for chunk, code in zip(chunks, codes):
                                        try:
                                            sent.append(
                                                (chunk, code.encode('utf-8')))
                                        except LookupError:
                                            pass
                            """
                            elif args.gloss:
                                tags += token.gloss.gloss.encode('utf-8')
                                sent.append((token.token, tags))
                            """
                    if len(sent) > 1:
                        allsents.append(sent)
                        sent = []
        if args.verbose and args.tone:
            enc.report()

        # Build the training and evaluation sets
        p = (1 - args.evalsize / 100.0)
        train_set, eval_set = sampling(allsents, p)
        print 'Split the data in train (', len(train_set), \
            ' sentences) / test (', len(eval_set), ' sentences)'

        print 'Building classifier (CRF/NLTK)'
        # Initialization
        t1 = time.time()
        if args.tone:
            num_phases = len([False, True]) * len(mode_indicators)
            myzip = zipfile.ZipFile(args.learn + '.zip', 'w')
        else:
            num_phases = 1

        # Training
        for phase in range(num_phases):
            tagger = CRFTagger(verbose=args.verbose,
                               training_opt={'feature.minfreq': 10})
            trainer = pycrfsuite.Trainer(verbose=tagger._verbose)
            trainer.set_params(tagger._training_options)
            if num_phases > 1:
                model_name = args.learn + '.' + str(phase)
            else:
                model_name = args.learn
            # train_set : list(list((str, list(str))))
            for sent in train_set:
                tokens = unzip(sent)[0]
                labels = unzip(sent)[1]
                if num_phases > 1:
                    labels = [code_dispatcher(label.decode('utf-8'))
                              [phase].encode('utf-8') for label in labels]
                features = [_get_features_customised_for_tones(tokens, i)
                            for i in range(len(tokens))]
                trainer.append(features, labels)
            trainer.train(model=model_name)
            if num_phases > 1:
                myzip.write(model_name)
                os.remove(model_name)
        if num_phases > 1:
            myzip.close()
        print "... done in", get_duration(t1_secs=t1, t2_secs=time.time())

        # Evaluation
        print 'Evaluating classifier'
        # gold_set, predicted_set : list(list((str, str)))
        # input_set, output_gold_set : list(list(str))
        gold_set = eval_set
        input_set = [unzip(sent)[0] for sent in gold_set]
        predicted_set = [list() for sent in gold_set]
        if num_phases > 1:
            myzip = zipfile.ZipFile(args.learn + '.zip', 'r')
        for phase in range(num_phases):
            tagger = CRFTagger(verbose=args.verbose,
                               training_opt={'feature.minfreq': 10})
            trainer = pycrfsuite.Trainer(verbose=tagger._verbose)
            trainer.set_params(tagger._training_options)
            if num_phases > 1:
                model_name = args.learn + '.' + str(phase)
                myzip.extract(model_name)
            else:
                model_name = args.learn
            tagger.set_model_file(model_name)
            for i, sent in enumerate(input_set):
                features = [_get_features_customised_for_tones(sent, j)
                            for j in range(len(sent))]
                labels = tagger._tagger.tag(features)
                if num_phases > 1:
                    labels = [code_dispatcher(label.decode('utf-8'))
                              [phase].encode('utf-8') for label in labels]
                tagged_sent = list(zip(sent, labels))
                if not predicted_set[i]:
                    predicted_set[i] = tagged_sent
                else:
                    sent_acc, labels_acc = unzip(predicted_set[i])
                    labels_acc = [label_acc + label
                                  for label_acc, label in zip(labels_acc, labels)]
                    predicted_set[i] = list(zip(sent_acc, labels_acc))
            if num_phases > 1:
                os.remove(model_name)
        if num_phases > 1:
            myzip.close()

        # gold_tokens, predicted_tokens : list((str, str))
        predicted_tokens = list(itertools.chain(*predicted_set))
        if num_phases > 1:
            predicted_tokens = [
                tuple([pair[0],
                       code_resort(pair[1].decode('utf-8')).encode('utf-8')])
                for pair in predicted_tokens]
        gold_tokens = list(itertools.chain(*gold_set))
        # gold_tokens_eval, predicted_tokens_eval : list(str)
        if args.tone:
            gold_tokens_eval = getTag(gold_tokens)
            predicted_tokens_eval = getTag(predicted_tokens)
        else:
            gold_tokens_eval = gold_tokens
            predicted_tokens_eval = predicted_tokens

        if args.store and args.tone:
            stored_filename = args.store
            csv_export(enc, stored_filename, gold_tokens, predicted_tokens)
        print "Accuracy : {:>5.3f}".format(
            accuracy(gold_tokens_eval, predicted_tokens_eval))
        if args.verbose and args.store:
            print "Tagged result is exported in {}".format(args.store)

    elif args.disambiguate and args.infile and args.outfile:
        # Read the input .html text
        html_parser = FileParser()
        tagger = CRFTagger()
        if args.pos:
            try:
                tagger.set_model_file(args.disambiguate)
            except IOError:
                print "Error : unable to open the model {} !".format(
                    args.disambiguate)
                exit(1)
            try:
                html_parser.read_file(args.infile)
            except IOError:
                print "Error : unable to open the input file {} !".format(
                    args.infile)
                exit(1)

            # Export the disambiguation result as .html
            for snum, sentence in enumerate(html_parser.glosses):
                tokens = [token.token for token in sentence[2]]
                features = [_get_features_customised_for_tones(tokens, i)
                            for i in range(len(tokens))]
                tagger._tagger.set(features)
                for tnum, token in enumerate(sentence[2]):
                    options = list()
                    if token.value and len(token.value) > 2:
                        for nopt, option in enumerate(token.value[2]):
                            try:
                                tag = option.ps[0]
                            except IndexError:
                                tag = ''
                            prob = tagger._tagger.marginal(tag, tnum)
                            options.append((prob, option))
                        reordered_probs, reordered_options = unzip(
                            sorted(options, reverse=True))
                        if args.select:
                            prob_max = reordered_probs[0]
                            reordered_options = tuple(
                                [reordered_options[i]
                                 for i, p in enumerate(reordered_probs)
                                 if p >= prob_max])
                        html_parser.glosses[snum][1][tnum] = reordered_options
        elif args.tone:
            pass

        try:
            html_parser.write(args.outfile)
        except IOError:
            print "Error : unable to create the output file {}".format(
                args.outfile)
    else:
        aparser.print_help()
        exit(0)
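# Typical invocations of the disambiguator above, assuming the script is
# saved as disambiguator.py (the filename is an assumption; the options are
# those defined in the argparse setup):
#
#   python disambiguator.py -l pos.model -p -r ../corbama -f files.txt
#       learn a POS model from the corpus files listed in files.txt
#   python disambiguator.py -l tone.model -t -r ../corbama -f files.txt -s out.csv
#       learn tone models (one per phase, bundled into tone.model.zip) and
#       store the tagged data for inspection
#   python disambiguator.py -d pos.model -p -i in.html -o out.html --select
#       disambiguate a parsed file, keeping only the most likely gloss(es)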