def filter_by(target, source, env):
    """
    Intersect one vocabulary file with another.

    Reads a vocabulary from source[0] and a second vocabulary from
    source[1], keeps the entries of the first that survive
    Vocabulary.filter_by against the second, and writes the formatted
    result to target[0].

    The (target, source, env) signature looks like an SCons builder
    action; env is not used here.  Always returns None.
    """
    with meta_open(source[0].rstr()) as kept_fd:
        with meta_open(source[1].rstr()) as limit_fd:
            kept_vocab = Vocabulary(kept_fd)
            limit_vocab = Vocabulary(limit_fd)
            result = kept_vocab.filter_by(limit_vocab)

    with meta_open(target[0].rstr(), "w") as out_fd:
        rendered = result.format()
        out_fd.write(rendered)
    return None
def probability_list_to_vocabulary(target, source, env):
    """
    Turn a probability list file into a vocabulary file.

    Parses source[0] as a ProbabilityList, builds a Vocabulary from the
    words it reports via get_words(), and writes the formatted vocabulary
    to target[0].

    env is not used.  Always returns None.
    """
    with meta_open(source[0].rstr()) as prob_fd:
        prob_list = ProbabilityList(prob_fd)

    with meta_open(target[0].rstr(), "w") as out_fd:
        # The vocabulary is built only after the output file is opened,
        # matching the original ordering of side effects.
        word_set = prob_list.get_words()
        vocabulary = Vocabulary.from_set(word_set)
        out_fd.write(vocabulary.format())
    return None
def text_to_vocabulary(target, source, env):
    """
    Build a vocabulary file from the whitespace-separated tokens of a text.

    source[0] is the text file.  When it is the only source the text is
    lower-cased; otherwise the truthiness of source[1].read() decides
    whether lower-casing is applied.  Tokens that contain an underscore,
    or that begin or end with a hyphen, are discarded.  The remaining
    unique tokens are handed to Vocabulary.from_set and the formatted
    vocabulary is written to target[0].

    env is not used.  Always returns None.
    """
    if len(source) == 1:
        lower_case = True
    else:
        lower_case = source[1].read()

    with meta_open(source[0].rstr()) as text_fd:
        text = text_fd.read()
        if lower_case:
            text = text.lower()
        tokens = set(text.split())

    # Drop tokens with "_" anywhere or "-" at either edge.
    accepted = [
        tok for tok in tokens
        if not ("_" in tok or tok.startswith("-") or tok.endswith("-"))
    ]
    vocabulary = Vocabulary.from_set(accepted)

    with meta_open(target[0].rstr(), "w") as out_fd:
        out_fd.write(vocabulary.format())
    return None
def filter_words(target, source, env):
    """
    Restrict a vocabulary, pronunciation file and language model to the
    words of a limiting vocabulary.

    Sources, in the order they are read:
      source[0]  vocabulary file            (Vocabulary)
      source[1]  pronunciation file         (Pronunciations)
      source[2]  language model file        (Arpabo)
      source[3]  limiting vocabulary file   (Vocabulary)

    Targets, in the order they are written:
      target[0]  filtered vocabulary
      target[1]  filtered pronunciations
      target[2]  filtered language model

    The original notes state that the language model probabilities are
    scaled such that unigrams sum to one; any such rescaling would happen
    inside Arpabo.filter_by, which is not visible from this function.

    Before filtering, the probability mass of the words to be removed and
    of the words to remain is logged, along with summaries of every input;
    summaries of the filtered objects are logged afterwards.

    env is not used.  Always returns None.
    """
    with meta_open(source[0].rstr()) as voc_fd, \
            meta_open(source[1].rstr()) as pron_fd, \
            meta_open(source[2].rstr()) as lm_fd, \
            meta_open(source[3].rstr()) as lim_fd:
        lm = Arpabo(lm_fd)
        pron = Pronunciations(pron_fd)
        voc = Vocabulary(voc_fd)
        lim = Vocabulary(lim_fd)

        logging.info("Original vocabulary: %s", voc)
        logging.info("Original pronunciations: %s", pron)
        logging.info("Original LM: %s", lm)
        logging.info("Limiting vocabulary: %s", lim)
        # Mass statistics must be taken before the LM is filtered.
        logging.info("Vocabulary to remove has mass: %s",
                     lm.get_probability_of_not_words(lim.get_words()))
        logging.info("Vocabulary to remain has mass: %s",
                     lm.get_probability_of_words(lim.get_words()))

        # NOTE(review): these two are assumed to filter in place; their
        # return values are not used anywhere visible -- confirm.
        lm.filter_by(lim)
        pron.filter_by(lim)
        # Vocabulary.filter_by's return value is what the filter_by builder
        # in this file writes out, so use it here too instead of relying on
        # in-place mutation.  This is correct whether the method returns a
        # new object or the (mutated) receiver.
        voc = voc.filter_by(lim)

        logging.info("New vocabulary: %s", voc)
        logging.info("New pronunciations: %s", pron)
        logging.info("New LM: %s", lm)

    with meta_open(target[0].rstr(), "w") as voc_ofd, \
            meta_open(target[1].rstr(), "w") as pron_ofd, \
            meta_open(target[2].rstr(), "w") as lm_ofd:
        voc_ofd.write(voc.format())
        pron_ofd.write(pron.format())
        lm_ofd.write(lm.format())
    return None