def apply_morfessor(target, source, env):
    """Apply a trained Morfessor model to an unseen word list.

    Sources: morfessor model file, word list file(s)
    Targets: segmented word list

    Every source after the first is read as a word list.  It is first
    parsed as XML: for each ``kw`` element, the text of its first ``kwtext``
    descendant is split on whitespace.  If anything goes wrong while doing
    so, the file is re-read as plain text and the first whitespace-separated
    field of every non-blank line is taken as a word.

    Words containing "_" are dropped; the remaining words are stripped of
    leading/trailing "-" and split on "-".  Each unique piece is segmented
    with ``model.viterbi_segment``.  Segmentations of two or more morphs are
    written with "+" boundary markers ("first+ +middle+ +last"); the output
    has one segmentation per line, sorted.

    Returns None.
    """
    parser = get_default_argparser()
    args = parser.parse_args([])
    io = MorfessorIO(encoding=args.encoding,
                     compound_separator=args.cseparator,
                     atom_separator=args.separator)
    model = io.read_binary_model_file(source[0].rstr())

    words = []
    for fname in source[1:]:
        path = fname.rstr()
        try:
            # Collect into a per-file list so that a failure half-way through
            # the XML leaves no partial results behind.
            file_words = []
            with meta_open(path, enc=None) as ifd:
                for kw in et.parse(ifd).getiterator("kw"):
                    text = list(kw.getiterator("kwtext"))[0].text
                    file_words += text.strip().split()
        except Exception:
            # Format sniffing: anything that is not usable keyword XML is
            # treated as a plain word list.  ``Exception`` (not a bare
            # ``except``) lets KeyboardInterrupt/SystemExit propagate.
            with meta_open(path) as ifd:
                fields_per_line = (line.strip().split() for line in ifd)
                # Blank lines yield no fields and are skipped.
                file_words = [fields[0] for fields in fields_per_line if fields]
        # Accumulate across all word-list sources instead of overwriting.
        words += file_words

    pieces = {
        piece
        for word in words
        if "_" not in word
        for piece in word.strip("-").split("-")
    }

    lines = []
    for piece in pieces:
        toks, _score = model.viterbi_segment(piece)
        if len(toks) >= 2:
            toks = (["%s+" % toks[0]]
                    + ["+%s+" % t for t in toks[1:-1]]
                    + ["+%s" % toks[-1]])
        lines.append(" ".join(toks))

    with meta_open(target[0].rstr(), "w") as ofd:
        ofd.write("\n".join(sorted(lines)) + "\n")
    return None
def train_morfessor(target, source, env):
    """Train a Morfessor model using a word list as input.

    This builder is largely based on the code in the new Python version of
    Morfessor.  Note that it prevents splitting that would create a morph
    composed just of non-acoustic graphemes.

    Sources: word list file
    Targets: segmented word list file, morfessor model file

    Vocabulary lines are either "word" (each hyphen-separated piece gets
    count 1) or "word count" (the count is added to each piece).  Blank
    lines are skipped.  Pieces matching ``env["NON_WORD_PATTERN"]``
    (default "^$", i.e. the empty string) are discarded before training.

    ``env["NON_ACOUSTIC_GRAPHEMES"]`` is read as a sequence of hexadecimal
    code points; ``env["FORCE_SPLIT"]`` is passed through as the model's
    force-split list.

    Returns None on success, or an error message string when a vocabulary
    line has more than two fields.

    NOTE(review): relies on ``unichr`` and ``dict.iteritems``, so this runs
    under Python 2 only as written.
    """
    parser = get_default_argparser()
    args = parser.parse_args([])
    dampfunc = lambda count: count  # identity: no frequency dampening

    graphemes = env.get("NON_ACOUSTIC_GRAPHEMES", [])
    if graphemes:
        # Escape each grapheme so that a regex metacharacter among them is
        # matched literally instead of corrupting the alternation.
        alternatives = "|".join(
            re.escape(unichr(int(code, base=16))) for code in graphemes
        )
        rx_str = "((\\S(%(ALT)s))|(^(%(ALT)s)\\S))" % {"ALT": alternatives}
    else:
        rx_str = None

    model = BaselineModel(forcesplit_list=env.get("FORCE_SPLIT", []),
                          corpusweight=1.0,
                          use_skips=False,
                          nosplit_re=rx_str)
    io = MorfessorIO(encoding=args.encoding,
                     compound_separator=args.cseparator,
                     atom_separator=args.separator)

    words = {}
    with meta_open(source[0].rstr()) as ifd:
        for line in ifd:
            toks = line.strip().split()
            if not toks:
                continue  # blank line: nothing to count
            if len(toks) > 2:
                return "malformed vocabulary line: %s" % (line.strip())
            for word in toks[0].split("-"):
                if len(toks) == 1:
                    words[word] = 1
                else:
                    words[word] = words.get(word, 0) + int(toks[1])

    # Compile once rather than resolving the pattern for every word.
    non_word = re.compile(env.get("NON_WORD_PATTERN", "^$"))
    words = {w: c for w, c in words.iteritems() if not non_word.match(w)}

    # Each entry is (count, compound, atoms); the word itself serves as its
    # own atom sequence.
    model.load_data([(c, w, w) for w, c in words.iteritems()],
                    args.freqthreshold, dampfunc, args.splitprob)

    algparams = ()
    develannots = None
    model.train_batch(args.algorithm, algparams, develannots,
                      args.finish_threshold, args.maxepochs)

    with meta_open(target[0].rstr(), "w") as ofd:
        for _count, morphs in model.get_segmentations():
            if len(morphs) >= 2:
                morphs = (["%s+" % morphs[0]]
                          + ["+%s+" % t for t in morphs[1:-1]]
                          + ["+%s" % morphs[-1]])
            ofd.write(" ".join(morphs) + "\n")

    io.write_binary_model_file(target[1].rstr(), model)
    return None
def __init__(self, lang, side, lowercase, model_path: str = None):
    """Store the preprocessing settings and optionally load a Morfessor model.

    Args:
        lang: stored on the instance as ``self.lang``.
        side: stored on the instance as ``self.side``.
        lowercase: stored on the instance as ``self.lowercase``.
        model_path: path to a binary Morfessor model. When falsy (the
            default), no model is loaded and ``self.morf_model`` stays None.

    Raises:
        ImportError: ``morfessor`` is not installed (re-raised after logging
            an installation hint).
        Exception: whatever ``read_binary_model_file`` raises is re-raised
            after logging a hint about Python 2 models.
    """
    self.lang = lang
    self.side = side
    self.lowercase = lowercase
    self.morf_model = None
    if not model_path:
        return

    log.info(f"Loading morph model from {model_path}")
    # morfessor is an optional dependency, so it is imported only on demand.
    try:
        from morfessor import MorfessorIO
    except ImportError:
        log.error("Please do `pip install morfessor`")
        raise
    try:
        self.morf_model = MorfessorIO().read_binary_model_file(model_path)
    except Exception:
        # Not a bare ``except``: an interrupt should not be reported as a
        # model-format problem.
        log.error(
            "If this is a py2 model, see https://github.com/aalto-speech/morfessor/issues/12"
        )
        raise
class Preprocessor:
    """
    Preprocessor to match the training settings of aligner used to build TTables.
    Accepts a sentence and converts into either tokens or morphemes optionally lowercasing
    """

    def __init__(self, lang, side, lowercase, model_path: str = None):
        """Store the settings and optionally load a Morfessor model.

        Args:
            lang: stored as ``self.lang``.
            side: stored as ``self.side``.
            lowercase: when truthy, sentences are lowercased before splitting.
            model_path: path to a binary Morfessor model. When falsy (the
                default), sentences are only whitespace-tokenized.

        Raises:
            ImportError: ``morfessor`` is not installed (re-raised after
                logging an installation hint).
            Exception: whatever ``read_binary_model_file`` raises is
                re-raised after logging a hint about Python 2 models.
        """
        self.lang = lang
        self.side = side
        self.lowercase = lowercase
        self.morf_model = None
        if not model_path:
            return

        log.info(f"Loading morph model from {model_path}")
        # morfessor is an optional dependency, so it is imported on demand.
        try:
            from morfessor import MorfessorIO
        except ImportError:
            log.error("Please do `pip install morfessor`")
            raise
        try:
            self.morf_model = MorfessorIO().read_binary_model_file(model_path)
        except Exception:
            # Not a bare ``except``: an interrupt should not be reported as
            # a model-format problem.
            log.error(
                "If this is a py2 model, see https://github.com/aalto-speech/morfessor/issues/12"
            )
            raise

    # NOTE(review): lru_cache on a method keys on ``self`` and keeps every
    # instance alive for the lifetime of the cache; the cached list is also
    # shared between calls, so callers must not mutate it. Left unchanged to
    # preserve ``morfess.cache_info`` / ``cache_clear`` for existing callers.
    @functools.lru_cache(maxsize=100_000)
    def morfess(self, word: str) -> List[str]:
        """Return the Viterbi segmentation of ``word`` (the score is dropped)."""
        splits, _score = self.morf_model.viterbi_segment(word)
        return splits

    def __call__(self, sentence: str) -> List[str]:
        """Convert ``sentence`` into whitespace tokens, or morphs if a model is loaded."""
        if self.lowercase:
            sentence = sentence.lower()
        words = sentence.split()
        if self.morf_model is None:
            return words
        # Flatten the per-word segmentations into one morph sequence.
        return [morph for word in words for morph in self.morfess(word)]