def read_annotation_log(filename):
    """Read an annotation log file into (annotations, nonwords).

    Each line is expected to hold at least three tab-separated fields:
    surface word, analysis, and a status tag.  Lines tagged 'Eval',
    'Modified' or 'Predicted' yield an Annotation; lines tagged
    'Nonword' add the surface word to the nonword list.

    :param filename: path of the UTF-8 annotation log to read
    :return: tuple (annotations, nonwords) where annotations is a list
        of Annotation objects and nonwords is a list of surface words
    """
    io = flatcat.FlatcatIO(encoding='utf-8')
    out = []
    nonwords = []
    # enforces splitting of hyphens and colons
    fs = flatcat.flatcat.ForceSplitter(':-', None)
    with codecs.open(filename, 'r', encoding='utf-8') as fobj:
        for line in fobj:
            line = line.strip()
            parts = line.split('\t')
            if len(parts) < 3:
                print('Cant parse annotation "{}"'.format(line))
                # FIX: skip the malformed line; previously execution fell
                # through to parts[2] and raised IndexError.
                continue
            if parts[2] in ('Eval', 'Modified', 'Predicted'):
                analysis = io.read_annotation(
                    parts[1],
                    construction_sep=' ',
                )[0]
                analysis = fs.enforce_one(analysis)
                out.append(Annotation(parts[0], analysis))
            elif parts[2] == 'Nonword':
                nonwords.append(parts[0])
    # multiple analyses for the same surface word
    # are returned as separate Annotations
    return (out, nonwords)
def ModelTraining(segmentation_file):
    """Build and initialize a FlatCat model from a segmentation file.

    :param segmentation_file: path to a corpus segmentation file
    :return: a trained-ready FlatcatModel with its HMM initialized
    """
    usage_props = flatcat.categorizationscheme.MorphUsageProperties()
    model = flatcat.FlatcatModel(usage_props, corpusweight=1.0)
    # Feed the segmented corpus straight into the model.
    model.add_corpus_data(
        flatcat.FlatcatIO().read_segmentation_file(segmentation_file))
    model.initialize_hmm()
    return model
def load_flatcat_model(filename):
    """Load and initialize a FlatCat model from a tarball archive.

    :param filename: path to the model tarball
    :return: the loaded FlatcatModel with its HMM initialized
    """
    model = flatcat.FlatcatIO().read_tarball_model_file(filename)
    model.initialize_hmm()
    return model
def read_old_annotations(filename):
    """Read an old-format annotation file into (annotations, nonwords).

    Each line holds two tab-separated fields: the surface word and
    either '!' (marking a nonword) or a comma-separated list of
    space-separated analyses.

    :param filename: path of the UTF-8 annotation file to read
    :return: tuple (annotations, nonwords) where annotations is a list
        of Annotation objects and nonwords is a list of surface words
    """
    io = flatcat.FlatcatIO(encoding='utf-8')
    out = []
    nonwords = []
    with codecs.open(filename, 'r', encoding='utf-8') as fobj:
        for line in fobj:
            line = line.strip()
            parts = line.split('\t')
            if len(parts) < 2:
                print('Cant parse annotation "{}"'.format(line))
                # FIX: skip the malformed line; previously execution fell
                # through to parts[1] and raised IndexError.
                continue
            if parts[1] == '!':
                nonwords.append(parts[0])
            else:
                analysis = io.read_annotation(
                    parts[1],
                    construction_sep=' ',
                    analysis_sep=',',
                )
                out.append(Annotation(parts[0], analysis))
    return (out, nonwords)
def fit_flatcat_model(datafile, corpusweight=1.0, randomState=None):
    """Fit a FlatCat model on a segmentation file using batch training.

    :param datafile: path to a corpus segmentation file
    :param corpusweight: corpus likelihood weight for the model cost
    :param randomState: seed passed to random.seed for reproducibility
    :return: the trained FlatcatModel
    """
    random.seed(randomState)
    model = flatcat.FlatcatModel(
        flatcat.categorizationscheme.MorphUsageProperties(),
        corpusweight=corpusweight)
    model.add_corpus_data(
        flatcat.FlatcatIO().read_segmentation_file(datafile))
    model.initialize_hmm()
    # Training schedule taken from
    # https://github.com/aalto-speech/flatcat/blob/master/flatcat/cmd.py#L755
    started = arrow.now()
    model.train_batch(
        # Stop training if cost reduction between iterations is below this limit * #boundaries.
        min_iteration_cost_gain=0.0025,
        # Stop training if cost reduction between epochs is below this limit * #boundaries.
        # In semi-supervised training the cost is not monotonous between epochs, so this
        # limit is meaningless.
        min_epoch_cost_gain=None,
        # The number of training epochs.
        max_epochs=4,
        # Maximum number of iterations of each operation in the first epoch.
        max_iterations_first=1,
        # Maximum number of iterations of each operation in the subsequent epochs.
        max_iterations=1,
        # Maximum number of iterations of resegmentation in all epochs.
        max_resegment_iterations=2,
        # Maximum shift distance allowed for a shift operation.
        max_shift_distance=2,
        # Minimum number of letters remaining in the shorter morph after a shift operation.
        min_shift_remainder=2)
    LOG.info('Final cost: {}'.format(model.get_cost()))
    finished = arrow.now()
    LOG.info('Training time: {}'.format(finished - started))
    return model
def main(argv):
    """Driver: rank unseen training-pool words with a selection metric
    and write the selected words plus model predictions to disk.

    Workflow: parse args; guess any unspecified input filenames from the
    previous iteration's outputs; load the FlatCat model; filter already
    seen words (prior selections + nonwords) out of the training pool;
    rank the pool with the chosen metric; optionally apply
    representative sampling; write selections, scores and predictions.

    :param argv: command-line argument list (without the program name)
    """
    parser = get_argparser()
    args = parser.parse_args(argv)
    io = flatcat.FlatcatIO(encoding='utf-8')
    prev_iter = args.iteration - 1
    print('Metric: {}, Next iteration: {}, Previous iteration: {}'.format(
        args.metric, args.iteration, prev_iter))
    # Instantiate the metric class registered under the given name.
    metric = METRICS[args.metric]()
    if not args.outdir[-1] == '/':
        args.outdir += '/'
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)
    # Guessing filenames
    model_filename = args.model
    if model_filename is None:
        print('Trying to guess model file...')
        # Accept exactly one file matching
        # "<prev_iter>.flatcat.<metric>.*.model.tar.gz".
        for filename in os.listdir('models'):
            if not filename.startswith(
                    '{}.flatcat.{}.'.format(prev_iter, args.metric)):
                continue
            if not filename.endswith('.model.tar.gz'):
                continue
            if model_filename is not None:
                raise Exception(
                    'Both "{}" and "{}" match the model pattern'.format(
                        model_filename, filename))
            model_filename = os.path.join(args.modeldir, filename)
        if model_filename is None:
            raise Exception('Model not found')
        print('... guessing "{}"'.format(model_filename))
    nonword_filename = args.oldnonwords
    if nonword_filename is None:
        nonword_filename = os.path.join(
            'annotations', '{}.nonword.words'.format(prev_iter))
        print('Nonword file not specified, guessing "{}"'.format(
            nonword_filename))
    # A missing nonword file is not an error: treat all words as valid.
    if not os.path.exists(nonword_filename):
        print('Nonword file ({}) not found, '
              'assuming all words are valid'.format(
                  nonword_filename))
        nonword_filename = None
    oldselected_filename = args.oldselected
    if oldselected_filename is None:
        oldselected_filename = os.path.join(
            'annotations', '{}.train.{}.annotated.words'.format(
                prev_iter, args.metric))
        print('Previously selected word file not specified, '
              'guessing "{}"'.format(
                  oldselected_filename))
    # A missing selection file just means nothing was selected before.
    if not os.path.exists(oldselected_filename):
        print('Previously selected word file ({}) not found, '
              'assuming no selections have been made'.format(
                  oldselected_filename))
        oldselected_filename = None
    if args.oldoracle is not None:
        if not os.path.exists(args.oldoracle):
            raise Exception('Old oracle file "{}" not found'.format(
                args.oldoracle))
    # The metric name used in output filenames may be overridden.
    if args.overridemetric is not None:
        metric_out = args.overridemetric
    else:
        metric_out = args.metric
    selection_filename = os.path.join(
        args.outdir,
        '{}.train.{}.all.selected'.format(args.iteration, metric_out))
    scores_filename = os.path.join(
        args.outdir,
        '{}.train.{}.all.scores'.format(args.iteration, metric_out))
    unseen_filename = os.path.join(
        args.outdir,
        '{}.train.{}.unseen.selected'.format(args.iteration, metric_out))
    prediction_filename = os.path.join(
        args.outdir,
        '{}.train.{}.unseen.predictions'.format(args.iteration, metric_out))
    # load, initialize, read
    print('Loading model...')
    model = io.read_tarball_model_file(model_filename)
    print('...done')
    model.initialize_hmm()  # FIXME: automate
    if oldselected_filename is not None:
        seen = set(tools.read_wordlist(oldselected_filename))
    else:
        seen = set()
    if nonword_filename is not None:
        nonwords = tools.read_wordlist(nonword_filename)
    else:
        nonwords = []
    seen.update(nonwords)
    if args.oldoracle is not None:
        oracle = set(tools.read_wordlist(args.oldoracle))
    else:
        oracle = set()
    trainpool = next(tools.get_pools(['train'], args.pooldir))
    # already selected words (incl nonwords) cannot be reselected
    trainpool = tools.filter_pool(trainpool, seen)
    # perform selection
    selector = selection.Selector(
        metric, model,
        progress=flatcat.utils._generator_progress)
    if args.configcorpus is not None:
        print('Configuring metric with "{}"'.format(args.configcorpus))
        selector.configure(
            tools.read_wordlist(args.configcorpus),
            seen=seen)
    print('Performing ranking...')
    ranked = selector.rank(trainpool, seen=seen, n=args.num_annots)
    print('...done')
    # write scores (debug)
    selection.write_scores(ranked, scores_filename)
    # apply representative sampling, if needed
    if args.representative is not None and args.representative > 0:
        print('Performing representative sampling...')
        # imported lazily: only needed when representative sampling is on
        from morphsegannot.tools import representative
        truncated = [item.word for item in ranked[:args.representative]]
        selected = representative.representative_sampling(
            truncated, args.num_annots)
    else:
        selected = [item.word for item in ranked[:args.num_annots]]
    # write
    with codecs.open(selection_filename, 'w', encoding='utf-8') as selfobj:
        with codecs.open(unseen_filename, 'w', encoding='utf-8') as unfobj:
            with codecs.open(prediction_filename, 'w',
                             encoding='utf-8') as prfobj:
                for word in selected:
                    selfobj.write('{}\n'.format(word))
                    # Words already known to the oracle need no new
                    # annotation: only unseen words get a prediction.
                    if word not in oracle:
                        unfobj.write('{}\n'.format(word))
                        (morphs, _) = model.viterbi_segment(word)
                        prfobj.write('{}\t{}\n'.format(
                            word, ' + '.join(morphs)))
def save_flatcat_model(filename, model):
    """Write a FlatCat model out as a tarball archive.

    :param filename: destination path for the model tarball
    :param model: the FlatcatModel to serialize
    """
    flatcat.FlatcatIO().write_tarball_model_file(filename, model)
def main(argv):
    """Driver: for each requested metric, rank the unseen training pool
    and write the selected words and their scores.

    Workflow per metric: load (or reuse an overriding) FlatCat model,
    collect already-annotated words and nonwords as "seen", filter them
    out of the training pool, rank the remainder with the metric, and
    write selection + score files for this iteration.

    :param argv: command-line argument list (without the program name)
    """
    parser = get_argparser()
    args = parser.parse_args(argv)
    io = flatcat.FlatcatIO(encoding='utf-8')
    if not args.outdir[-1] == '/':
        args.outdir += '/'
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)
    # single model overriding the metric-specific ones
    overridemodel = None
    if args.overridemodel is not None:
        print('Loading overridemodel...')
        overridemodel = io.read_tarball_model_file(args.overridemodel)
        print('...done')
        overridemodel.initialize_hmm()  # FIXME: automate
    nonword_filename = os.path.join(
        args.annotsdir, '{}.nonword.words'.format(args.iteration))
    if not os.path.exists(nonword_filename):
        print('No nonword file ({}), assuming all words are valid'.format(
            nonword_filename))
        nonword_filename = None
    for metric_name in args.metrics:
        print('Metric: {}'.format(metric_name))
        metric = METRICS[metric_name]()
        # workaround for metric needing high and low models
        if metric_name.startswith('alphabracket'):
            metric.set_models(
                io.read_tarball_model_file(
                    os.path.join(
                        args.modeldir,
                        '{}.flatcat.{}_low.model.tar.gz'.format(
                            args.iteration, 'alphabracket'))),
                io.read_tarball_model_file(
                    os.path.join(
                        args.modeldir,
                        '{}.flatcat.{}_hi.model.tar.gz'.format(
                            args.iteration, 'alphabracket'))))
        if args.overrideseen is None:
            annot_filename = os.path.join(
                args.annotsdir,
                '{}.train.{}.annotated.words'.format(
                    args.iteration - 1, metric_name))
        else:
            annot_filename = args.overrideseen
        # Accept exactly one file matching
        # "<iteration>.flatcat.<metric>.*.model.tar.gz".
        model_filename = None
        for filename in os.listdir(args.modeldir):
            if not filename.startswith('{}.flatcat.{}.'.format(
                    args.iteration, metric_name)):
                continue
            if not filename.endswith('.model.tar.gz'):
                continue
            if model_filename is not None:
                raise Exception(
                    'Both "{}" and "{}" match the model pattern'.format(
                        model_filename, filename))
            model_filename = os.path.join(args.modeldir, filename)
        if model_filename is None and args.overridemodel is None:
            raise Exception(
                'Model for metric "{}" not found'.format(metric_name))
        selection_filename = os.path.join(
            args.outdir,
            '{}.train.{}.selected'.format(args.iteration, metric_name))
        scores_filename = os.path.join(
            args.outdir,
            '{}.train.{}.scores'.format(args.iteration, metric_name))
        if os.path.exists(annot_filename):
            seen = set(tools.read_wordlist(annot_filename))
        else:
            print('No annotations file ({})'.format(annot_filename))
            seen = set()
        if nonword_filename is not None:
            nonwords = tools.read_wordlist(nonword_filename)
        else:
            nonwords = []
        seen.update(nonwords)
        if overridemodel is not None:
            model = overridemodel
        elif metric_name.startswith('alphabracket'):
            # alphabracket metrics carry their own low/high models
            model = None
        else:
            model = io.read_tarball_model_file(model_filename)
            model.initialize_hmm()
        # FIX: use the builtin next() instead of the Python-2-only
        # generator method .next(), which raises AttributeError on
        # Python 3 (the sibling selection driver already uses next()).
        trainpool = next(tools.get_pools(['train'], args.pooldir))
        # already selected words (incl nonwords) cannot be reselected
        trainpool = tools.filter_pool(trainpool, seen)
        # perform selection
        selector = selection.Selector(
            metric, model,
            progress=flatcat.utils._generator_progress)
        if args.configcorpus is not None:
            print('Configuring metric with "{}"'.format(args.configcorpus))
            selector.configure(tools.read_wordlist(args.configcorpus),
                               seen=seen)
        print('Performing ranking...')
        ranked = selector.rank(trainpool, seen=seen, n=args.num_annots)
        print('...done')
        # write
        selection.write_selected(ranked, selection_filename,
                                 args.num_annots)
        selection.write_scores(ranked, scores_filename)