def boSection(self, order):
    """Open the modified back-off distribution file for one m-gram order.

    order -- zero-based order; the underlying file is named '<n>bo' with
    n = order + 1.
    Returns a Writer wrapping the opened file.
    """
    f = gOpenOut(self.filename('%dbo' % (order + 1)))
    # Fixed typo in the emitted header ('modfied' -> 'modified').
    comment = 'This is a modified back-off %d-gram distribution file.\n' % (
        order + 1)
    if self.notice:
        # bug fix: 'notice' was an undefined global here; the attribute
        # the guard just tested is what must be prepended.
        comment = self.notice + '\n' + comment
    part = self.Writer(f, self.vocabulary, comment)
    return part
def topSection(self, order):
    """Open the top-level m-gram model file and register its back-off parts.

    order -- zero-based order; the file is named '<n>' with n = order + 1.
    Each lower-order back-off file ('1bo' .. '<order>bo') is included by
    base name so the parts can be found next to the top file.
    Returns a Writer wrapping the opened file.
    """
    f = gOpenOut(self.filename('%d' % (order + 1)))
    comment = 'This is a %d-gram model file.\n' % (order + 1)
    if self.notice:
        # bug fix: 'notice' was an undefined global; use the instance
        # attribute that the guard just tested.
        comment = self.notice + '\n' + comment
    part = self.Writer(f, self.vocabulary, comment)
    for oo in range(order):
        part.include(os.path.basename(self.filename('%dbo' % (oo + 1))))
    return part
def main(options, args):
    """Build m-gram counts from raw text or merge existing count files.

    Driven entirely by `options`: counts from options.text OR merges
    options.read files; optionally maps OOV words to the unknown symbol,
    writes counts to options.write, and dumps counts-of-counts to
    options.counts_of_counts.
    """
    if options.vocabulary:
        vocabulary = loadVocabulary(options.vocabulary)
    else:
        vocabulary = OpenVocabulary()

    if options.text:
        text = misc.gOpenIn(options.text)
        # bug fix: itertools.imap does not exist in Python 3 (and this
        # function already uses the Python 3 print function).  The built-in
        # map is lazy in Python 3 and thus a drop-in replacement; the inner
        # mapping is materialized as a list to keep the old list semantics.
        sentences = map(str.split, text)
        sentences = map(lambda s: [vocabulary.map(w) for w in s], sentences)
        grams = mGramsChainCount(sentences, options.order - 1)
        counts = createStorage(options)
        counts.addIter(grams)
    elif options.read:
        if len(options.read) > 1:
            counts = createStorage(options)
            counts.addIter(
                consolidate(
                    mergeSort([TextStorage(fname) for fname in options.read])))
        else:
            counts = TextStorage(options.read[0])
    else:
        print("no counts", file=sys.stderr)
        return

    if options.map_oov:
        if not options.vocabulary:
            # NOTE(review): execution deliberately falls through and uses
            # the OpenVocabulary built above -- confirm this best-effort
            # behavior is intended rather than a missing `return`.
            print("you need to specify a vocabulary", file=sys.stderr)
        filt = MapUnknownsFilter(counts, vocabulary.list, vocabulary.unknownSymbol)
        mappedCounts = createStorage(options)
        mappedCounts.addIter(filt.rawIter())
        counts = mappedCounts

    if options.write:
        countFile = misc.gOpenOut(options.write)
        TextStorage.write(countFile, counts)

    if options.counts_of_counts:
        coc = [
            countsOfCounts(mGramReduceToOrder(counts, order))
            for order in range(options.order)
        ]
        import pprint
        pprint.pprint(coc, misc.gOpenOut(options.counts_of_counts))
def main(options, args):
    """Build m-gram counts from raw text or merge existing count files.

    Python 3 port of the old Python 2 variant: `print >> stream` statements
    and itertools.imap/ifilter are not valid Python 3, yet this file already
    relies on the Python 3 print function elsewhere.
    """
    if options.vocabulary:
        vocabulary = loadVocabulary(options.vocabulary)
    else:
        vocabulary = OpenVocabulary()

    if options.text:
        text = misc.gOpenIn(options.text)
        # itertools.imap -> built-in map (lazy in Python 3); the inner
        # mapping is materialized as a list to keep the old list semantics.
        sentences = map(str.split, text)
        sentences = map(lambda s: [vocabulary.map(w) for w in s], sentences)
        grams = mGramsChainCount(sentences, options.order - 1)
        counts = createStorage(options)
        counts.addIter(grams)
    elif options.read:
        if len(options.read) > 1:
            counts = createStorage(options)
            counts.addIter(
                consolidate(
                    mergeSort([TextStorage(fname) for fname in options.read])))
        else:
            counts = TextStorage(options.read[0])
    else:
        print('no counts', file=sys.stderr)
        return

    if options.map_oov:
        if not options.vocabulary:
            # NOTE(review): no return here -- the OpenVocabulary built above
            # is used; confirm this fall-through is intended.
            print('you need to specify a vocabulary', file=sys.stderr)
        filt = MapUnknownsFilter(counts, vocabulary.list, vocabulary.unknownSymbol)
        mappedCounts = createStorage(options)
        mappedCounts.addIter(filt.rawIter())
        counts = mappedCounts

    if options.write:
        countFile = misc.gOpenOut(options.write)
        TextStorage.write(countFile, counts)

    if options.counts_of_counts:
        coc = [
            countsOfCounts(mGramReduceToOrder(counts, order))
            for order in range(options.order)
        ]
        import pprint
        pprint.pprint(coc, misc.gOpenOut(options.counts_of_counts))
def makeLmWriter(options):
    """Create an LM writer according to options.lm_format.

    Supported formats: 'arpa' (single file at options.lm) and 'estar'
    (one file per order, named from the prefix/suffix of options.lm).
    Raises ValueError for any other format string.
    """
    if options.lm_format == 'arpa':
        fname = options.lm
        # Converted Python 2 'print >> sys.stdout' to the Python 3 print
        # function used elsewhere in this file.
        print('will write LM to', fname, '...', file=sys.stdout)
        lm = LmArpaWriter(gOpenOut(fname), options.order - 1, notice)
    elif options.lm_format == 'estar':
        filePrefix, fileSuffix = os.path.splitext(options.lm)
        print('will write LM to %s-*%s ...' % (filePrefix, fileSuffix),
              file=sys.stdout)
        lm = LmEstarWriter(filePrefix, fileSuffix, notice)
    else:
        raise ValueError(options.lm_format)
    return lm
def makeLmWriter(options):
    """Build the LM writer selected by options.lm_format.

    'arpa' writes one file; 'estar' writes per-order files derived from
    the prefix/suffix of options.lm.  Any other value raises ValueError.
    """
    fmt = options.lm_format
    if fmt not in ("arpa", "estar"):
        raise ValueError(fmt)
    if fmt == "arpa":
        target = options.lm
        print("will write LM to", target, "...", file=sys.stdout)
        return LmArpaWriter(gOpenOut(target), options.order - 1, notice)
    # 'estar' format: split the output name once and reuse both halves.
    prefix, suffix = os.path.splitext(options.lm)
    print("will write LM to %s-*%s ..." % (prefix, suffix), file=sys.stdout)
    return LmEstarWriter(prefix, suffix, notice)
def main(options, args):
    """Drive the translation tool: obtain a model, then test/apply it.

    Depending on `options` this trains or loads a model (unless a fake
    translator is requested), then runs evaluation on a test sample,
    applies the translator to a sample file, and/or translates single
    words.  Returns 1 on failure to procure a model, otherwise None.
    """
    import locale
    # Select the sample loader for the input pairing direction.
    if options.phoneme_to_phoneme:
        loadSample = loadP2PSample
    else:
        loadSample = loadG2PSample
    enc = locale.getpreferredencoding()
    # Wrap the standard streams so characters outside the preferred
    # encoding are escaped instead of raising; Python 3 text streams
    # expose the underlying byte stream via .buffer, older ones do not.
    if hasattr(sys.stdout, 'buffer'):
        log_stdout = codecs.getwriter(enc)(sys.stdout.buffer,
                                           errors='backslashreplace')
    else:
        log_stdout = codecs.getwriter(enc)(sys.stdout,
                                           errors='backslashreplace')
    if hasattr(sys.stderr, 'buffer'):
        log_stderr = codecs.getwriter(enc)(sys.stderr.buffer,
                                           errors='backslashreplace')
    else:
        log_stderr = codecs.getwriter(enc)(sys.stderr,
                                           errors='backslashreplace')
    # the encoding relates to the lexicon, not the standard IO
    # log_stdout = codecs.getwriter(options.encoding, errors='backslashreplace')(sys.stdout) if options.encoding else sys.stdout;
    # log_stderr = codecs.getwriter(options.encoding, errors='backslashreplace')(sys.stderr) if options.encoding else sys.stderr;
    if options.fakeTranslator:
        # Bypass model procurement entirely and answer from a fixed sample.
        translator = MemoryTranslator(loadSample(options.fakeTranslator))
    else:
        model = SequiturTool.procureModel(options, loadSample, log=log_stdout)
        if not model:
            return 1
        if options.testSample or options.applySample or options.applyWord:
            translator = Translator(model)
            if options.stack_limit:
                translator.setStackLimit(options.stack_limit)
        # Release the (potentially large) model as soon as the translator
        # has been built from it.
        del model
    if options.testSample:
        mainTest(translator, loadSample(options.testSample), options,
                 log_stdout)
        translator.reportStats(log_stdout)
    if options.applySample:
        # Apply-mode results go to stdout ('-'); stats therefore go to
        # stderr so they do not mix with the output stream.
        mainApply(translator, options,
                  gOpenOut('-', options.encoding or defaultEncoding))
        translator.reportStats(log_stderr)
    if options.applyWord:
        mainApplyWord(translator, options, log_stdout)
def mainTest(translator, testSample, options, log=None):
    """Evaluate `translator` on `testSample` and print the result report.

    log -- stream for verbose output and the final result; defaults to the
    module-level `stdout` writer, keeping existing 3-argument callers
    working while also accepting the 4th positional stream that main()
    passes.
    """
    if log is None:
        log = stdout
    if options.shouldTranspose:
        testSample = SequiturTool.transposeSample(testSample)
    if options.testResult:
        resultFile = gOpenOut(options.testResult, defaultEncoding)
    else:
        resultFile = None
    from Evaluation import Evaluator
    evaluator = Evaluator()
    evaluator.setSample(testSample)
    evaluator.resultFile = resultFile
    evaluator.verboseLog = log
    if options.test_segmental:
        # Ignore supra-segmental markers when comparing pronunciations.
        supraSegmental = set(['.', "'", '"'])

        def removeSupraSegmental(phon):
            # bug fix: Python 3's filter() returns a one-shot iterator;
            # return a list (the Python 2 behavior) so the filtered
            # pronunciation can be consumed more than once.
            return [p for p in phon if p not in supraSegmental]

        evaluator.compareFilter = removeSupraSegmental
    result = evaluator.evaluate(translator)
    # Converted Python 2 'print >> stdout' to the Python 3 print function.
    print(result, file=log)
def main(options, args):
    """Build open-vocabulary LM resources around a reference lexicon.

    Steps (each gated by an option): load lexicon; load fragmentizer
    model; write extended lexicon; write LM token list; generate and
    write LM events/counts; dump OOV fragmentations; dump modified LM
    training text.
    """
    # 1. load reference lexicon
    print('loading reference lexicon ...')
    lexicon = loadBlissLexicon(options.lexicon)
    knownWords = {orth for orth, phon in lexicon}

    # 2. load model for fragmentizing unknown words
    if options.subliminal_lexicon:
        print('loading subliminal lexicon ...')
        subliminalLexicon = loadBlissLexicon(options.subliminal_lexicon)
    else:
        subliminalLexicon = None
    if options.subliminal_g2p:
        print('loading subliminal g2p model ...')
        # bug fix: pickle files must be opened in binary mode on Python 3.
        subliminalG2p = pickle.load(open(options.subliminal_g2p, 'rb'))
    else:
        subliminalG2p = None
    if options.g2pModel:
        print('loading g2p model ...')
        model = pickle.load(open(options.g2pModel, 'rb'))
        oldSize, newSize = model.strip()
        print('stripped number of multigrams from %d to %d' % (oldSize, newSize))
        fragmentizer = Fragmentizer(model)
        if subliminalLexicon:
            fragmentizer.addSupervised(subliminalLexicon)
        if subliminalG2p:
            fragmentizer.addSupervised(subliminalG2p)
        graphones = model.sequitur.symbols()
        graphones.remove(model.sequitur.symbol(model.sequitur.term))
    else:
        model = fragmentizer = graphones = None

    # 3. add fragments to lexicon
    if options.write_lexicon:
        print('creating extended lexicon ...')
        xmlLexicon = ElementTree(file=options.lexicon)
        if options.model_type == 'phonemes':
            changeSyntaticToPhonetic(xmlLexicon)
        else:
            addGraphonesToLexicon(xmlLexicon, graphones)
        xmlLexicon.write(gOpenOut(options.write_lexicon), defaultEncoding)

    # 4. determine set of LM tokens
    vocabulary = mGramCounts.ClosedVocablary()
    vocabulary.add(['<s>', '</s>'])
    if options.model_type == 'flat-hybrid':
        vocabulary.add(filter(isLmToken, knownWords), soft=True)
    if graphones:
        vocabulary.add(starmap(lmToken, graphones))
    vocabulary.sort()
    if options.write_tokens:
        f = gOpenOut(options.write_tokens, defaultEncoding)
        if options.model_type == 'phonemes':
            phonemes = set(p for orth, phon in lexicon for p in phon)
            phonemes.add('#1')
            if 'si' in phonemes:
                phonemes.remove('si')
            for p in sorted(phonemes):
                print(p, file=f)
        else:
            for w in vocabulary:
                if w is not None:
                    print(w, file=f)

    # 5./6. set-up LM event generator
    if options.write_counts or options.write_events:
        order = options.order - 1
        if options.model_type == 'flat-hybrid':
            events = HybridEventGenerator(knownWords, fragmentizer, order)
            if options.range_type == 'fragments':
                events.setFragmentRange()
            elif options.range_type == 'words':
                events.setTrueWordRange()
            else:
                # bug fix: was 'assert ValueError(...)', which always
                # passes because a ValueError instance is truthy.
                raise ValueError(options.range_type)
        elif options.model_type == 'fragments':
            events = OovEventGenerator(knownWords, fragmentizer, order)
        elif options.model_type == 'phonemes':
            events = PhonemeEventGenerator(lexicon, order)
        else:
            # robustness: previously 'events' was silently left unbound,
            # causing a confusing NameError further down.
            raise ValueError(options.model_type)

    # 5. create modified LM training corpus counts
    if options.write_events:
        print('creating sequence model events ...')
        f = gOpenOut(options.write_events, defaultEncoding)
        for event, count in events(gOpenIn(options.text, defaultEncoding)):
            print(repr(event), '\t', count, file=f)

    # 6. count LM events
    if options.write_counts:
        print('creating sequence model counts ...')
        counts = mGramCounts.SimpleMultifileStorage()
        counts.addIter(events(gOpenIn(options.text, defaultEncoding)))
        mGramCounts.TextStorage.write(
            gOpenOut(options.write_counts, defaultEncoding), counts)

    # 7. dump list of OOV words and their corresponding fragmentation
    if options.write_fragments:
        print('dumping fragments ...')
        f = gOpenOut(options.write_fragments, defaultEncoding)
        events = OovFragmentGenerator(knownWords, fragmentizer)
        fragments = events(gOpenIn(options.text, defaultEncoding))
        for event in list(fragments.keys()):
            print(event, '\t', ' '.join(fragments[event]), file=f)

    # 8. dump modified LM training text
    if options.write_lm_text:
        print('dumping modified LM training text ...')
        f = gOpenOut(options.write_lm_text, defaultEncoding)
        events = OovFragmentGenerator(knownWords, fragmentizer)
        for line in gOpenIn(options.text, defaultEncoding):
            words = line.split()
            modWords = events.modifyLmText(words)
            print(" ".join(modWords), file=f)
def main(options, args):
    """Build open-vocabulary LM resources around a reference lexicon.

    Python 3 port of the old Python 2 variant of this driver: `print`
    statements and itertools.ifilter are not valid Python 3, and this
    file already uses the Python 3 print function elsewhere.  Steps are
    gated by options: lexicon, fragmentizer model, extended lexicon,
    token list, events/counts, OOV fragmentations, modified LM text.
    """
    # 1. load reference lexicon
    print('loading reference lexicon ...')
    lexicon = loadBlissLexicon(options.lexicon)
    knownWords = {orth for orth, phon in lexicon}

    # 2. load model for fragmentizing unknown words
    if options.subliminal_lexicon:
        print('loading subliminal lexicon ...')
        subliminalLexicon = loadBlissLexicon(options.subliminal_lexicon)
    else:
        subliminalLexicon = None
    if options.subliminal_g2p:
        print('loading subliminal g2p model ...')
        # bug fix: pickle files must be opened in binary mode on Python 3.
        subliminalG2p = pickle.load(open(options.subliminal_g2p, 'rb'))
    else:
        subliminalG2p = None
    if options.g2pModel:
        print('loading g2p model ...')
        model = pickle.load(open(options.g2pModel, 'rb'))
        oldSize, newSize = model.strip()
        print('stripped number of multigrams from %d to %d' % (oldSize, newSize))
        fragmentizer = Fragmentizer(model)
        if subliminalLexicon:
            fragmentizer.addSupervised(subliminalLexicon)
        if subliminalG2p:
            fragmentizer.addSupervised(subliminalG2p)
        graphones = model.sequitur.symbols()
        graphones.remove(model.sequitur.symbol(model.sequitur.term))
    else:
        model = fragmentizer = graphones = None

    # 3. add fragments to lexicon
    if options.write_lexicon:
        print('creating extended lexicon ...')
        xmlLexicon = ElementTree(file=options.lexicon)
        if options.model_type == 'phonemes':
            changeSyntaticToPhonetic(xmlLexicon)
        else:
            addGraphonesToLexicon(xmlLexicon, graphones)
        xmlLexicon.write(gOpenOut(options.write_lexicon), defaultEncoding)

    # 4. determine set of LM tokens
    vocabulary = mGramCounts.ClosedVocablary()
    vocabulary.add(['<s>', '</s>'])
    if options.model_type == 'flat-hybrid':
        # itertools.ifilter -> built-in filter (lazy in Python 3).
        vocabulary.add(filter(isLmToken, knownWords), soft=True)
    if graphones:
        vocabulary.add(starmap(lmToken, graphones))
    vocabulary.sort()
    if options.write_tokens:
        f = gOpenOut(options.write_tokens, defaultEncoding)
        if options.model_type == 'phonemes':
            phonemes = set(p for orth, phon in lexicon for p in phon)
            phonemes.add('#1')
            if 'si' in phonemes:
                phonemes.remove('si')
            for p in sorted(phonemes):
                print(p, file=f)
        else:
            for w in vocabulary:
                if w is not None:
                    print(w, file=f)

    # 5./6. set-up LM event generator
    if options.write_counts or options.write_events:
        order = options.order - 1
        if options.model_type == 'flat-hybrid':
            events = HybridEventGenerator(knownWords, fragmentizer, order)
            if options.range_type == 'fragments':
                events.setFragmentRange()
            elif options.range_type == 'words':
                events.setTrueWordRange()
            else:
                # bug fix: was 'assert ValueError(...)', which always
                # passes because a ValueError instance is truthy.
                raise ValueError(options.range_type)
        elif options.model_type == 'fragments':
            events = OovEventGenerator(knownWords, fragmentizer, order)
        elif options.model_type == 'phonemes':
            events = PhonemeEventGenerator(lexicon, order)
        else:
            # robustness: previously 'events' was silently left unbound.
            raise ValueError(options.model_type)

    # 5. create modified LM training corpus counts
    if options.write_events:
        print('creating sequence model events ...')
        f = gOpenOut(options.write_events, defaultEncoding)
        for event, count in events(gOpenIn(options.text, defaultEncoding)):
            print(repr(event), '\t', count, file=f)

    # 6. count LM events
    if options.write_counts:
        print('creating sequence model counts ...')
        counts = mGramCounts.SimpleMultifileStorage()
        counts.addIter(events(gOpenIn(options.text, defaultEncoding)))
        mGramCounts.TextStorage.write(
            gOpenOut(options.write_counts, defaultEncoding), counts)

    # 7. dump list of OOV words and their corresponding fragmentation
    if options.write_fragments:
        print('dumping fragments ...')
        f = gOpenOut(options.write_fragments, defaultEncoding)
        events = OovFragmentGenerator(knownWords, fragmentizer)
        fragments = events(gOpenIn(options.text, defaultEncoding))
        for event in list(fragments.keys()):
            print(event, '\t', ' '.join(fragments[event]), file=f)

    # 8. dump modified LM training text
    if options.write_lm_text:
        print('dumping modified LM training text ...')
        f = gOpenOut(options.write_lm_text, defaultEncoding)
        events = OovFragmentGenerator(knownWords, fragmentizer)
        for line in gOpenIn(options.text, defaultEncoding):
            words = line.split()
            modWords = events.modifyLmText(words)
            print(" ".join(modWords), file=f)
def set(self, other):
    """Overwrite the file at self.fname with the contents of `other`.

    Serializes via self.write using self.outputConversion and closes the
    file even if writing fails.
    """
    # Renamed local 'file' -> 'out' to avoid shadowing the builtin.
    out = gOpenOut(self.fname)
    try:
        # bug fix: 'othe' was a NameError typo for the parameter 'other'.
        self.write(out, other, self.outputConversion)
    finally:
        out.close()
def boSection(self, order):
    """Open the modified back-off distribution file for one m-gram order.

    order -- zero-based order; the underlying file is named '<n>bo' with
    n = order + 1.
    Returns a Writer wrapping the opened file.
    """
    f = gOpenOut(self.filename('%dbo' % (order + 1)))
    # Fixed typo in the emitted header ('modfied' -> 'modified').
    comment = 'This is a modified back-off %d-gram distribution file.\n' % (
        order + 1)
    if self.notice:
        # bug fix: 'notice' was an undefined global here; the attribute
        # the guard just tested is what must be prepended.
        comment = self.notice + '\n' + comment
    part = self.Writer(f, self.vocabulary, comment)
    return part