示例#1
0
    def train(
        encoding: str,
        model_path: str,
        input_path: str or None,
        tag_order: int,
        emission_order: int,
        suff_length: int,
        rare_freq: int,
        separator: str,
        linesep: str,
    ):  # todo verbose mode
        """Create a language model from an analysed corpus and save it to ``model_path``.

        If a file already exists at ``model_path`` it is loaded and trained further on
        the input; otherwise a new model is trained from scratch. Progress messages and
        the training statistics are printed to stderr.

        :param encoding: The encoding of the corpus file. If None, the default of
            ``open`` is used. Not applied when reading from stdin.
        :param model_path: Path of the model file. If it exists, the stored model is
            improved; in both cases the resulting model is written back to this path.
        :param input_path: Path of the analysed corpus. If None, stdin is used.
        :param tag_order: Passed to ``Trainer.train``; only used when no model file
            exists yet.  # todo document meaning
        :param emission_order: Passed to ``Trainer.train``; only used when no model file
            exists yet.  # todo document meaning
        :param suff_length: Passed to ``Trainer.train``; only used when no model file
            exists yet.  # todo document meaning
        :param rare_freq: Passed to ``Trainer.train``; only used when no model file
            exists yet.  # todo document meaning
        :param separator: The separator character(s) inside the token.
            Default/traditionally: '#'.
        :param linesep: The separator character(s) between the sentences. Default: newline.
        """
        opened_here = input_path is not None
        if opened_here:
            # todo default encoding? (Python3 picks a sensible one when encoding is None)
            source = open(input_path, encoding=encoding)
        else:
            source = sys.stdin

        try:
            trainer = Trainer(source, CorpusReader(StemmedTaggedTokenReader(separator, linesep)))

            if os.path.isfile(model_path):
                print("Reading model... ", file=sys.stderr)
                ret_model = StandardSerializer.read_model(model_path)
                print("Training model... ", file=sys.stderr)
                ret_model = trainer.train_model(ret_model)
            else:
                print("Training model... ", file=sys.stderr)
                ret_model = trainer.train(tag_order, emission_order, suff_length, rare_freq)
            print(trainer.stat.stat(ret_model), file=sys.stderr)
            print("Writing model... ", file=sys.stderr)
            StandardSerializer.write_model(ret_model, model_path)
        finally:
            # Release the corpus file even if training or serialisation fails.
            # stdin is not ours to close, so only close what was opened above.
            if opened_here:
                source.close()
        print("Done!", file=sys.stderr)
示例#2
0
    def create_tagger(
        model_path: str,
        analyser: str,
        no_stemming: bool,
        max_guessed: int,
        beam_log_theta: float,
        use_beam_search: bool,
        conf: Configuration,
        humor_path: str,
        lex_path: str,
    ) -> POSTagger:
        """Build a tagger from a stored model and the selected morphological analyser.

        :param model_path: Path of the model file read with ``StandardSerializer.read_model``.
        :param analyser: Analyser selector. ``PurePos.INTEGRATED_MA`` loads Humor,
            ``PurePos.NONE_MA`` uses a plain ``BaseMorphologicalAnalyser``; any other
            value is treated as the path of a morphological table file.
        :param no_stemming: If true a ``POSTagger`` is created, otherwise a ``MorphTagger``.
        :param max_guessed: Forwarded unchanged to the tagger constructor.
        :param beam_log_theta: Forwarded unchanged to the tagger constructor.
        :param use_beam_search: Forwarded unchanged to the tagger constructor.
        :param conf: Configuration the raw model is compiled with.
        :param humor_path: Humor base directory; ``/bin/pyhumor/__init__.py`` is appended.
        :param lex_path: Passed to ``PurePos.load_humor`` together with the Humor module path.
        :return: a tagger object.
        """
        if analyser == PurePos.INTEGRATED_MA:
            humor_module = humor_path + "/bin/pyhumor/__init__.py"
            try:
                morph_analyser = PurePos.load_humor(humor_module, lex_path)
            except FileNotFoundError:
                # Best effort: fall back to the analyser that is also used for NONE_MA.
                print("Humor module not found. Not using any morphological analyzer.", file=sys.stderr)
                morph_analyser = BaseMorphologicalAnalyser()
        elif analyser == PurePos.NONE_MA:
            morph_analyser = BaseMorphologicalAnalyser()
        else:
            print("Using morphological table at: {}.".format(analyser), file=sys.stderr)
            # NOTE(review): this handle is never closed here; whether MorphologicalTable
            # reads it eagerly or keeps it for later is not visible from this function.
            table_file = open(analyser)
            morph_analyser = MorphologicalTable(table_file)

        print("Reading model... ", file=sys.stderr)
        raw_model = StandardSerializer.read_model(model_path)
        print("Compiling model... ", file=sys.stderr)
        compiled_model = raw_model.compile(conf)

        suffix_log_theta = math.log(10)
        # Both tagger classes are called with the same positional arguments.
        tagger_class = POSTagger if no_stemming else MorphTagger
        return tagger_class(
            compiled_model,
            morph_analyser,
            beam_log_theta,
            suffix_log_theta,
            max_guessed,
            use_beam_search,
        )