def evaluate_goldstandard(gs_file, trained_model_bin, feature_dict_json, affix_list):
    """Evaluate a stored segmentation model against a gold standard.

    :param gs_file: path to the gold-standard evaluation file
    :param trained_model_bin: path to a pickled (binary) Morfessor model
    :param feature_dict_json: path to the JSON feature dictionary produced
        by the model-building step
    :param affix_list: affixes handed through to the evaluator
    """
    # restore the pickled model and its companion feature dictionary
    io_manager = MorfessorIO(encoding='utf-8',
                             compound_separator='\s+',
                             atom_separator=None,
                             lowercase=False)
    stored_model = io_manager.read_binary_model_file(trained_model_bin)
    stored_features = InfixerModel.get_features_dict_from_file(feature_dict_json)

    # score the restored model on the gold-standard file
    scorer = InfixerEvaluation(stored_model, stored_features, affix_list)
    scorer.evaluate_model(gs_file)
def __init__(self, morfessor_model, feature_dict, affix_list):
    """Initialize an evaluation object with a model, feature dict, and affix list.

    :param morfessor_model: a trained Morfessor Baseline object
    :param feature_dict: the output dictionary from ModelBuilder object
    :param affix_list: affixes used to construct the affix filter
    """
    # keep references to the supplied model and feature data
    self._model = morfessor_model
    self._feature_dict = feature_dict

    # affix handling is delegated to a dedicated filter object
    self._affix_filter = AffixFilter(affix_list)

    # morfessor's own IO class handles file reading/writing
    self._io_manager = MorfessorIO()
if __name__ == "__main__":
    # Command-line interface: a trained morfessor model plus one input text file.
    parser = argparse.ArgumentParser()
    parser.add_argument("-mf", required=True, help="The morfessor model to use", dest="morf_model")
    parser.add_argument("input", nargs=1, help="Input text file")
    options = parser.parse_args()

    # Load config parameters
    # NOTE(review): at module top level locals() is globals(), so this injects
    # every key of `config` (including `seq_length`, used below) as a module
    # global. Fragile — explicit assignments would be clearer; confirm `config`
    # is defined/imported elsewhere in this file.
    locals().update(config)
    numpy.random.seed(0)  # fixed seed for reproducible runs

    # Read in the morfessor model (pickled binary produced by training)
    morf_segmenter = MorfessorIO().read_binary_model_file(options.morf_model)

    # Whitespace-tokenize the entire input file into a flat token list.
    with codecs.open(options.input[0], 'r', 'utf-8') as f:
        data = f.read().split()

    # Trim the token stream so that len(data) is a multiple of seq_length
    # plus one — presumably the extra trailing token is the final prediction
    # target; TODO confirm against the model that consumes `nsamples`.
    if len(data) % seq_length > 0:
        data = data[:len(data) - len(data) % seq_length + 1]
    else:
        data = data[:len(data) - seq_length + 1]
    nsamples = len(data) // seq_length  # number of full sequences available

    # Read in word-level data: vocabulary plus word<->index lookup tables
    words = set(data)
    vocab_size = len(words)
    word_to_ix = {word: i for i, word in enumerate(words)}
    ix_to_word = {i: word for i, word in enumerate(words)}
def morfessor_main(train_files, dampening, cycle, save_file=None):
    """Calls an implementation of the Morfessor model.

    :param train_files: input files for model training
    :param dampening: 'none', 'ones', or 'log'
    :param cycle: from {'init', 'test', 'final'}; appended to the saved
        model's file name so different cycles do not overwrite each other
    :param save_file: base name of output files (if needed)
    :return: trained morfessor.BaselineModel
    :raises ArgumentException: if `dampening` is not a recognized type
    """
    # define input variables normally input at command line
    # all arguments are equal to their args.item equivalent in the original
    # script's main()
    trainfiles = train_files     # input files for training
    progress = True              # show progress bar
    encoding = 'utf-8'           # if None, tries UTF-8 and/or local encoding
    cseparator = r'\s+'          # separator for compound segmentation (raw string avoids invalid-escape warning)
    separator = None             # separator for atom segmentation
    lowercase = False            # makes all inputs lowercase
    forcesplit = ['-']           # list of chars to force a split on
    corpusweight = 1.0           # corpus weight parameter
    skips = False                # use random skips for frequently seen compounds to speed up training
    nosplit = None               # if the expression matches the two surrounding characters, do not allow splitting
    algorithm = 'recursive'      # 'recursive' or 'viterbi'
    finish_threshold = 0.005     # train stops when the improvement of last iteration is smaller than this
    maxepochs = None             # ceiling on number of training epochs
    develannots = None           # dev-data annotations for corpus-weight tuning (None: disabled)
    splitprob = None             # initialize new words by random split using given probability
    epochinterval = 10000        # epoch interval for online training
    algparams = ()               # algorithm parameters; nothing to set for 'recursive'

    # Progress bar handling
    global show_progress_bar
    show_progress_bar = progress

    # build I/O and model
    io = MorfessorIO(encoding=encoding,
                     compound_separator=cseparator,
                     atom_separator=separator,
                     lowercase=lowercase)
    model = BaselineModel(forcesplit_list=forcesplit,
                          corpusweight=corpusweight,
                          use_skips=skips,
                          nosplit_re=nosplit)

    # Set frequency dampening function (plain defs instead of lambda
    # assignment, per PEP 8 / E731)
    if dampening == 'none':
        dampfunc = None
    elif dampening == 'log':
        def dampfunc(x):
            return int(round(math.log(x + 1, 2)))
    elif dampening == 'ones':
        def dampfunc(x):
            return 1
    else:
        raise ArgumentException("unknown dampening type '%s'" % dampening)

    # online training pass followed by a batch pass; the batch pass's
    # epochs/cost intentionally overwrite the online pass's return values
    if len(trainfiles) > 0:
        time_start = time.time()
        data = io.read_corpus_files(trainfiles)
        epochs, total_cost = model.train_online(data, dampfunc, epochinterval,
                                                algorithm, algparams,
                                                splitprob, maxepochs)
        epochs, total_cost = model.train_batch(algorithm, algparams, develannots,
                                               finish_threshold, maxepochs)
        _logger.info("Epochs: %s" % epochs)
        time_end = time.time()
        _logger.info("Final cost: %s" % total_cost)
        _logger.info("Training time: %.3fs" % (time_end - time_start))
    else:
        _logger.warning("No training data files specified.")

    # if save file is present, write binary model to file
    if isinstance(save_file, str):
        outfile_bin = save_file + cycle + "_bin"
        io.write_binary_model_file(outfile_bin, model)

    # return model object for further manipulation
    return model
def morfessor_main(train_files, dampening, cycle, save_file=None):
    """Calls an implementation of the Morfessor model.

    :param train_files: input files for model training
    :param dampening: 'none', 'ones', or 'log'
    :param cycle: from {'init', 'test', 'final'}; appended to the saved
        model's file name so different cycles do not overwrite each other
    :param save_file: base name of output files (if needed)
    :return: trained morfessor.BaselineModel
    :raises ArgumentException: if `dampening` is not a recognized type
    """
    # define input variables normally input at command line
    # all arguments are equal to their args.item equivalent in the original
    # script's main()
    trainfiles = train_files     # input files for training
    progress = True              # show progress bar
    encoding = 'utf-8'           # if None, tries UTF-8 and/or local encoding
    cseparator = r'\s+'          # separator for compound segmentation (raw string avoids invalid-escape warning)
    separator = None             # separator for atom segmentation
    lowercase = False            # makes all inputs lowercase
    forcesplit = ['-']           # list of chars to force a split on
    corpusweight = 1.0           # corpus weight parameter
    skips = False                # use random skips for frequently seen compounds to speed up training
    nosplit = None               # if the expression matches the two surrounding characters, do not allow splitting
    algorithm = 'recursive'      # 'recursive' or 'viterbi'
    finish_threshold = 0.005     # train stops when the improvement of last iteration is smaller than this
    maxepochs = None             # ceiling on number of training epochs
    develannots = None           # dev-data annotations for corpus-weight tuning (None: disabled)
    splitprob = None             # initialize new words by random split using given probability
    epochinterval = 10000        # epoch interval for online training
    algparams = ()               # algorithm parameters; nothing to set for 'recursive'

    # Progress bar handling
    global show_progress_bar
    show_progress_bar = progress

    # build I/O and model
    io = MorfessorIO(encoding=encoding,
                     compound_separator=cseparator,
                     atom_separator=separator,
                     lowercase=lowercase)
    model = BaselineModel(forcesplit_list=forcesplit,
                          corpusweight=corpusweight,
                          use_skips=skips,
                          nosplit_re=nosplit)

    # Set frequency dampening function (plain defs instead of lambda
    # assignment, per PEP 8 / E731)
    if dampening == 'none':
        dampfunc = None
    elif dampening == 'log':
        def dampfunc(x):
            return int(round(math.log(x + 1, 2)))
    elif dampening == 'ones':
        def dampfunc(x):
            return 1
    else:
        raise ArgumentException("unknown dampening type '%s'" % dampening)

    # online training pass followed by a batch pass; the batch pass's
    # epochs/cost intentionally overwrite the online pass's return values
    if len(trainfiles) > 0:
        time_start = time.time()
        data = io.read_corpus_files(trainfiles)
        epochs, total_cost = model.train_online(data, dampfunc, epochinterval,
                                                algorithm, algparams,
                                                splitprob, maxepochs)
        epochs, total_cost = model.train_batch(algorithm, algparams, develannots,
                                               finish_threshold, maxepochs)
        _logger.info("Epochs: %s" % epochs)
        time_end = time.time()
        _logger.info("Final cost: %s" % total_cost)
        _logger.info("Training time: %.3fs" % (time_end - time_start))
    else:
        _logger.warning("No training data files specified.")

    # if save file is present, write binary model to file
    # BUGFIX: the `cycle` tag was accepted but never used here, so models
    # saved in different cycles overwrote one another; include it in the
    # file name, consistent with the sibling implementation of this function
    if isinstance(save_file, str):
        outfile_bin = save_file + cycle + "_bin"
        io.write_binary_model_file(outfile_bin, model)

    # return model object for further manipulation
    return model
""" Trains a morfessor morphological segmenter for use in the morphological RNN """ from morfessor.io import MorfessorIO from morfessor.baseline import BaselineModel import argparse import subprocess import os if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--training-data", required=True, dest="training_data", help="Corpus to train Morfessor on") parser.add_argument("--output", dest="output", default="./morfessor_model", help="Output filename for the Morfessor model") options = parser.parse_args() data_reader = MorfessorIO() word_iterator = data_reader.read_corpus_file(options.training_data) model = BaselineModel() model.load_data(word_iterator, count_modifier=lambda x: 1) # Use types instead of tokens model.train_batch() data_reader.write_binary_model_file(options.output + "-" + os.path.basename(options.training_data) + ".bin", model)