def __init__(self,
             model: CompiledModel,
             analyser: BaseMorphologicalAnalyser,
             log_theta: float,
             suf_theta: float,
             max_guessed_tags: int,
             use_beam_search: bool):
    """Store the model and analyser and build the decoder.

    :param model: compiled model, kept as ``self.model`` and handed to the
        decoder.
    :param analyser: morphological analyser, kept as ``self.analyser`` and
        handed to the decoder.
    :param log_theta: forwarded unchanged to the decoder constructor.
    :param suf_theta: forwarded unchanged to the decoder constructor.
    :param max_guessed_tags: forwarded unchanged to the decoder constructor.
    :param use_beam_search: when true a ``BeamSearch`` decoder is created,
        otherwise a ``BeamedViterbi`` one.
    """
    self.model = model
    self.analyser = analyser
    # TODO: maybe take beam_size from the command line?
    decoder_cls = BeamSearch if use_beam_search else BeamedViterbi
    self.decoder = decoder_cls(
        model, analyser, log_theta, suf_theta, max_guessed_tags)
class POSTagger:
    """Tags sentences by running a decoder over a compiled model.

    The decoder is either a ``BeamSearch`` or a ``BeamedViterbi`` instance,
    selected at construction time.
    """

    @staticmethod
    def preprocess_sentence(sentence: list) -> list:
        """Register pre-analysed words and return the cleaned sentence.

        Re-initialises the shared ``util.analysis_queue`` for the length of
        *sentence*.  Every word that ``AnalysisQueue.ispreanalysed`` accepts
        is added to the queue at its position and replaced in the result by
        ``AnalysisQueue.clean(word)``; all other words are copied unchanged.

        :param sentence: list of word strings.
        :return: a new list with the same length as *sentence*.
        """
        util.analysis_queue.init(len(sentence))
        ret = []
        for i, word in enumerate(sentence):
            if AnalysisQueue.ispreanalysed(word):
                util.analysis_queue.add_word(word, i)
                ret.append(AnalysisQueue.clean(word))
            else:
                ret.append(word)
        return ret

    def __init__(self,
                 model: CompiledModel,
                 analyser: BaseMorphologicalAnalyser,
                 log_theta: float,
                 suf_theta: float,
                 max_guessed_tags: int,
                 use_beam_search: bool):
        """Store the model and analyser and build the decoder.

        :param model: compiled model; its ``data.tag_vocabulary`` is used by
            :meth:`merge` to turn tag ids into tag strings.
        :param analyser: morphological analyser handed to the decoder.
        :param log_theta: forwarded unchanged to the decoder constructor.
        :param suf_theta: forwarded unchanged to the decoder constructor.
        :param max_guessed_tags: forwarded unchanged to the decoder
            constructor.
        :param use_beam_search: when true a ``BeamSearch`` decoder is used,
            otherwise a ``BeamedViterbi`` one.
        """
        self.model = model
        self.analyser = analyser
        if use_beam_search:
            # TODO: maybe take beam_size from the command line?
            self.decoder = BeamSearch(model, analyser, log_theta, suf_theta,
                                      max_guessed_tags)
        else:
            self.decoder = BeamedViterbi(model, analyser, log_theta,
                                         suf_theta, max_guessed_tags)

    def tag_sentence(self,
                     sentence: list,  # list of strings
                     max_res: int) -> list:
        """Decode one sentence and return the resulting taggings.

        :param sentence: list of word strings; it is passed through
            :meth:`preprocess_sentence` before decoding.
        :param max_res: passed to ``decoder.decode`` (presumably the maximum
            number of taggings to return -- confirm against the decoder).
        :return: a list of ``Sentence`` objects, one per decoder result.
        """
        sentence = self.preprocess_sentence(sentence)
        tag_list = self.decoder.decode(sentence, max_res)
        # Each decoder result is indexed as (tag sequence, score).
        return [Sentence(self.merge(sentence, tags[0]), score=tags[1])
                for tags in tag_list]

    def merge(self, sentence: list, tags: list) -> list:
        """Pair the words of *sentence* with their decoded tags.

        Tag ids are mapped to tag strings through the model's tag
        vocabulary.  If the two lists differ in length the result is
        truncated to the shorter one.

        :param sentence: list of word strings.
        :param tags: list of tag ids, aligned with *sentence*.
        :return: list of ``Token`` objects.
        """
        vocab = self.model.data.tag_vocabulary
        return [Token(word, None, vocab.word(tag))
                for word, tag in zip(sentence, tags)]

    def tag(self,
            source: io.TextIOWrapper,
            dest: io.TextIOWrapper,
            max_results_number: int=1):
        """Tag every line of *source* and print the result to *dest*.

        One output line is printed per input line; blank input lines yield
        an empty output line.

        :param source: iterable of input lines.
        :param dest: file object the formatted results are printed to.
        :param max_results_number: forwarded to :meth:`tag_and_format`.
        """
        for line in source:
            sent_str = self.tag_and_format(line, max_results_number)
            print(sent_str, file=dest)

    def tag_and_format(self, line: str, max_res_num: int) -> str:
        """Tag a single whitespace-separated line and format the result.

        :param line: input line; it is split on whitespace into words.
        :param max_res_num: forwarded to :meth:`tag_sentence`; scores are
            included in the output only when it is greater than 1.
        :return: the formatted taggings, or ``""`` for a blank line.
        """
        if line.strip() == "":
            return ""
        sentences = self.tag_sentence(line.split(), max_res_num)
        return self.sentences_to_string(sentences, max_res_num > 1)

    def sentences_to_string(self, sentences: list, show_prob: bool) -> str:
        """Join the string forms of *sentences* with tab characters.

        :param sentences: list of ``Sentence`` objects.
        :param show_prob: whether each sentence's score is appended.
        """
        return "\t".join(self.sent_to_string(s, show_prob) for s in sentences)

    @staticmethod
    def sent_to_string(sentence: Sentence, show_prob: bool) -> str:
        """Return ``str(sentence)``, optionally followed by its score.

        :param sentence: the tagged sentence to format.
        :param show_prob: when true, ``$$<score>$$`` is appended.
        """
        ret = str(sentence)
        if show_prob:
            ret += "$${}$$".format(sentence.score)
        return ret