def train(
    encoding: str,
    model_path: str,
    input_path: "str | None",
    tag_order: int,
    emission_order: int,
    suff_length: int,
    rare_freq: int,
    separator: str,
    linesep: str,
):  # todo verbose mode
    """Create a language model from an analysed corpus (and optionally from an existing model).

    The corpus is read from ``input_path`` or, when that is None, from stdin.
    If a file already exists at ``model_path`` it is loaded and trained further;
    otherwise a new model is trained from scratch. The resulting model is always
    written back to ``model_path``. Progress messages and the training statistics
    are printed to stderr.

    :param encoding: The encoding of the corpus file. If None, Python's default is used.
        Not applied when reading from stdin.
    :param model_path: Path of the model file. If it exists, the stored model is improved.
    :param input_path: Path of the analysed corpus. If None, stdin is used.
    :param tag_order: Forwarded to ``Trainer.train``; only used when no model exists yet.
    :param emission_order: Forwarded to ``Trainer.train``; only used when no model exists yet.
    :param suff_length: Forwarded to ``Trainer.train``; only used when no model exists yet.
    :param rare_freq: Forwarded to ``Trainer.train``; only used when no model exists yet.
    :param separator: The separator character(s) inside the token. Default/traditionally: '#'.
    :param linesep: The separator character(s) between the sentences. Default: newline.
    """
    # Only a file opened here is ours to close; sys.stdin belongs to the caller.
    owns_source = input_path is not None
    if owns_source:
        # todo default encoding? (Python 3 picks a sensible one when encoding is None)
        source = open(input_path, encoding=encoding)
    else:
        source = sys.stdin

    try:
        # The handle stays open until the function is done: this block cannot see
        # at which point Trainer actually consumes the source.
        trainer = Trainer(source, CorpusReader(StemmedTaggedTokenReader(separator, linesep)))

        if os.path.isfile(model_path):
            print("Reading model... ", file=sys.stderr)
            ret_model = StandardSerializer.read_model(model_path)
            print("Training model... ", file=sys.stderr)
            ret_model = trainer.train_model(ret_model)
        else:
            print("Training model... ", file=sys.stderr)
            ret_model = trainer.train(tag_order, emission_order, suff_length, rare_freq)

        print(trainer.stat.stat(ret_model), file=sys.stderr)

        print("Writing model... ", file=sys.stderr)
        StandardSerializer.write_model(ret_model, model_path)
        print("Done!", file=sys.stderr)
    finally:
        if owns_source:
            source.close()
def create_tagger(
    model_path: str,
    analyser: str,
    no_stemming: bool,
    max_guessed: int,
    beam_log_theta: float,
    use_beam_search: bool,
    conf: Configuration,
    humor_path: str,
    lex_path: str,
) -> POSTagger:
    """Build a tagger from a stored model and the selected morphological analyser.

    :param model_path: Path of the model file, read with ``StandardSerializer.read_model``.
    :param analyser: Either ``PurePos.INTEGRATED_MA`` (load Humor), ``PurePos.NONE_MA``
        (no analyser), or otherwise the path of a morphological table file.
    :param no_stemming: If true a ``POSTagger`` is created, otherwise a ``MorphTagger``.
    :param max_guessed: Forwarded to the tagger constructor.
    :param beam_log_theta: Forwarded to the tagger constructor.
    :param use_beam_search: Forwarded to the tagger constructor.
    :param conf: Configuration the raw model is compiled with.
    :param humor_path: Humor installation directory; ``/bin/pyhumor/__init__.py`` is
        appended to it. Only used with ``PurePos.INTEGRATED_MA``.
    :param lex_path: Passed to ``PurePos.load_humor`` together with the module path.
        Only used with ``PurePos.INTEGRATED_MA``.
    :return: a tagger object.
    """
    # Step 1: pick the morphological analyser.
    if analyser == PurePos.INTEGRATED_MA:
        humor_module = humor_path + "/bin/pyhumor/__init__.py"
        try:
            morph_analyser = PurePos.load_humor(humor_module, lex_path)
        except FileNotFoundError:
            # Missing Humor is not fatal: fall back to the base analyser.
            print("Humor module not found. Not using any morphological analyzer.", file=sys.stderr)
            morph_analyser = BaseMorphologicalAnalyser()
    elif analyser == PurePos.NONE_MA:
        morph_analyser = BaseMorphologicalAnalyser()
    else:
        print("Using morphological table at: {}.".format(analyser), file=sys.stderr)
        # NOTE(review): this handle is never closed in this function and uses the
        # platform default encoding - confirm how MorphologicalTable consumes it.
        table_file = open(analyser)
        morph_analyser = MorphologicalTable(table_file)

    # Step 2: load the serialized model and compile it.
    print("Reading model... ", file=sys.stderr)
    raw_model = StandardSerializer.read_model(model_path)
    print("Compiling model... ", file=sys.stderr)
    compiled_model = raw_model.compile(conf)

    # Step 3: build the tagger; the suffix log-theta is fixed at log(10).
    suffix_log_theta = math.log(10)
    tagger_class = POSTagger if no_stemming else MorphTagger
    return tagger_class(
        compiled_model,
        morph_analyser,
        beam_log_theta,
        suffix_log_theta,
        max_guessed,
        use_beam_search,
    )