def main(options, args):
    """Build language-model training data with fragments for unknown words.

    Executes the numbered steps below in order.  The reference lexicon and
    the token vocabulary are always built; every output step runs only when
    the matching ``options.write_*`` attribute is set.

    Args:
        options: parsed command-line options.  Attributes read here:
            ``lexicon``, ``subliminal_lexicon``, ``subliminal_g2p``,
            ``g2pModel``, ``model_type`` (compared against 'flat-hybrid',
            'fragments' and 'phonemes'), ``range_type`` (compared against
            'fragments' and 'words'), ``order``, ``text``,
            ``write_lexicon``, ``write_tokens``, ``write_events``,
            ``write_counts``, ``write_fragments``, ``write_lm_text``.
        args: positional command-line arguments; not used by this function.

    Raises:
        ValueError: when events or counts are requested and
            ``options.model_type`` is not one of the three supported
            values, or ``options.model_type`` is 'flat-hybrid' and
            ``options.range_type`` is neither 'fragments' nor 'words'.
    """
    # 1. load reference lexicon
    print('loading reference lexicon ...')
    lexicon = loadBlissLexicon(options.lexicon)
    knownWords = {orth for orth, _ in lexicon}

    # 2. load model for fragmentizing unknown words
    if options.subliminal_lexicon:
        print('loading subliminal lexicon ...')
        subliminalLexicon = loadBlissLexicon(options.subliminal_lexicon)
    else:
        subliminalLexicon = None

    # NOTE: unpickling runs code stored in the file -- only load models from
    # trusted sources.  Pickles must be read in binary mode.
    if options.subliminal_g2p:
        print('loading subliminal g2p model ...')
        with open(options.subliminal_g2p, 'rb') as modelFile:
            subliminalG2p = pickle.load(modelFile)
    else:
        subliminalG2p = None

    if options.g2pModel:
        print('loading g2p model ...')
        with open(options.g2pModel, 'rb') as modelFile:
            model = pickle.load(modelFile)
        oldSize, newSize = model.strip()
        print('stripped number of multigrams from %d to %d' % (oldSize, newSize))

        fragmentizer = Fragmentizer(model)
        if subliminalLexicon:
            fragmentizer.addSupervised(subliminalLexicon)
        if subliminalG2p:
            fragmentizer.addSupervised(subliminalG2p)
        graphones = model.sequitur.symbols()
        # the termination symbol is excluded from the graphone inventory
        graphones.remove(model.sequitur.symbol(model.sequitur.term))
    else:
        model = fragmentizer = graphones = None

    # 3. add fragments to lexicon
    if options.write_lexicon:
        print('creating extended lexicon ...')
        xmlLexicon = ElementTree(file=options.lexicon)
        if options.model_type == 'phonemes':
            changeSyntaticToPhonetic(xmlLexicon)
        else:
            addGraphonesToLexicon(xmlLexicon, graphones)
        xmlLexicon.write(gOpenOut(options.write_lexicon), defaultEncoding)

    # 4. determine set of LM tokens
    vocabulary = mGramCounts.ClosedVocablary()
    vocabulary.add(['<s>', '</s>'])
    if options.model_type == 'flat-hybrid':
        vocabulary.add(filter(isLmToken, knownWords), soft=True)
    if graphones:
        vocabulary.add(starmap(lmToken, graphones))
    vocabulary.sort()
    if options.write_tokens:
        f = gOpenOut(options.write_tokens, defaultEncoding)
        if options.model_type == 'phonemes':
            # token list is the phoneme inventory of the lexicon,
            # plus '#1' and minus 'si'
            phonemes = set(p for _, phon in lexicon for p in phon)
            phonemes.add('#1')
            phonemes.discard('si')
            for p in sorted(phonemes):
                print(p, file=f)
        else:
            for w in vocabulary:
                if w is not None:
                    print(w, file=f)

    # 5./6. set-up LM event generator
    if options.write_counts or options.write_events:
        order = options.order - 1
        if options.model_type == 'flat-hybrid':
            events = HybridEventGenerator(knownWords, fragmentizer, order)
            if options.range_type == 'fragments':
                events.setFragmentRange()
            elif options.range_type == 'words':
                events.setTrueWordRange()
            else:
                # was `assert ValueError(...)`, which can never fail
                raise ValueError(options.range_type)
        elif options.model_type == 'fragments':
            events = OovEventGenerator(knownWords, fragmentizer, order)
        elif options.model_type == 'phonemes':
            events = PhonemeEventGenerator(lexicon, order)
        else:
            # without this, `events` would be unbound in steps 5 and 6
            raise ValueError(options.model_type)

    # 5. create modified LM training corpus counts
    if options.write_events:
        print('creating sequence model events ...')
        f = gOpenOut(options.write_events, defaultEncoding)
        for event, count in events(gOpenIn(options.text, defaultEncoding)):
            print(repr(event), '\t', count, file=f)

    # 6. count LM events
    if options.write_counts:
        print('creating sequence model counts ...')
        counts = mGramCounts.SimpleMultifileStorage()
        counts.addIter(events(gOpenIn(options.text, defaultEncoding)))
        mGramCounts.TextStorage.write(
            gOpenOut(options.write_counts, defaultEncoding), counts)

    # 7. dump list of OOV words and their corresponding fragmentation
    if options.write_fragments:
        print('dumping fragments ...')
        f = gOpenOut(options.write_fragments, defaultEncoding)
        events = OovFragmentGenerator(knownWords, fragmentizer)
        fragments = events(gOpenIn(options.text, defaultEncoding))
        for event in fragments.keys():
            print(event, '\t', ' '.join(fragments[event]), file=f)

    # 8. dump modified LM training text
    if options.write_lm_text:
        print('dumping modified LM training text ...')
        f = gOpenOut(options.write_lm_text, defaultEncoding)
        events = OovFragmentGenerator(knownWords, fragmentizer)
        for line in gOpenIn(options.text, defaultEncoding):
            words = line.split()
            modWords = events.modifyLmText(words)
            print(" ".join(modWords), file=f)
def main(options, args):
    """Build language-model training data with fragments for unknown words.

    Loads the reference lexicon and the optional fragmentizing models,
    builds the LM token vocabulary, and then writes each output for which
    the matching ``options.write_*`` attribute is set.

    NOTE(review): this file contains another definition of ``main`` further
    up; the later definition is the one bound at import time -- confirm
    which one is meant to stay.

    Args:
        options: parsed command-line options.  Attributes read here:
            ``lexicon``, ``subliminal_lexicon``, ``subliminal_g2p``,
            ``g2pModel``, ``model_type`` (compared against 'flat-hybrid',
            'fragments' and 'phonemes'), ``range_type`` (compared against
            'fragments' and 'words'), ``order``, ``text``,
            ``write_lexicon``, ``write_tokens``, ``write_events``,
            ``write_counts``, ``write_fragments``, ``write_lm_text``.
        args: positional command-line arguments; not used by this function.

    Raises:
        ValueError: when events or counts are requested and
            ``options.model_type`` is not one of the three supported
            values, or ``options.model_type`` is 'flat-hybrid' and
            ``options.range_type`` is neither 'fragments' nor 'words'.
    """
    # 1. load reference lexicon
    print('loading reference lexicon ...')
    lexicon = loadBlissLexicon(options.lexicon)
    knownWords = set(orth for orth, phon in lexicon)

    # 2. load model for fragmentizing unknown words
    subliminalLexicon = None
    if options.subliminal_lexicon:
        print('loading subliminal lexicon ...')
        subliminalLexicon = loadBlissLexicon(options.subliminal_lexicon)

    # NOTE: pickle.load executes code embedded in the file; the model files
    # must come from a trusted source.  Binary mode is required for pickles.
    subliminalG2p = None
    if options.subliminal_g2p:
        print('loading subliminal g2p model ...')
        with open(options.subliminal_g2p, 'rb') as pickled:
            subliminalG2p = pickle.load(pickled)

    model = fragmentizer = graphones = None
    if options.g2pModel:
        print('loading g2p model ...')
        with open(options.g2pModel, 'rb') as pickled:
            model = pickle.load(pickled)
        oldSize, newSize = model.strip()
        print('stripped number of multigrams from %d to %d' % (oldSize, newSize))

        fragmentizer = Fragmentizer(model)
        if subliminalLexicon:
            fragmentizer.addSupervised(subliminalLexicon)
        if subliminalG2p:
            fragmentizer.addSupervised(subliminalG2p)
        graphones = model.sequitur.symbols()
        # drop the termination symbol from the graphone inventory
        graphones.remove(model.sequitur.symbol(model.sequitur.term))

    # 3. add fragments to lexicon
    if options.write_lexicon:
        print('creating extended lexicon ...')
        xmlLexicon = ElementTree(file=options.lexicon)
        if options.model_type == 'phonemes':
            changeSyntaticToPhonetic(xmlLexicon)
        else:
            addGraphonesToLexicon(xmlLexicon, graphones)
        xmlLexicon.write(gOpenOut(options.write_lexicon), defaultEncoding)

    # 4. determine set of LM tokens
    vocabulary = mGramCounts.ClosedVocablary()
    vocabulary.add(['<s>', '</s>'])
    if options.model_type == 'flat-hybrid':
        # lazy filter written as a generator so it needs no itertools helper
        vocabulary.add((w for w in knownWords if isLmToken(w)), soft=True)
    if graphones:
        vocabulary.add(starmap(lmToken, graphones))
    vocabulary.sort()
    if options.write_tokens:
        f = gOpenOut(options.write_tokens, defaultEncoding)
        if options.model_type == 'phonemes':
            # phoneme inventory of the lexicon, with '#1' added and 'si' removed
            phonemes = set(p for orth, phon in lexicon for p in phon)
            phonemes.add('#1')
            phonemes.discard('si')
            for p in sorted(phonemes):
                print(p, file=f)
        else:
            for w in vocabulary:
                if w is not None:
                    print(w, file=f)

    # 5./6. set-up LM event generator
    if options.write_counts or options.write_events:
        order = options.order - 1
        if options.model_type == 'flat-hybrid':
            events = HybridEventGenerator(knownWords, fragmentizer, order)
            if options.range_type == 'fragments':
                events.setFragmentRange()
            elif options.range_type == 'words':
                events.setTrueWordRange()
            else:
                # the original `assert ValueError(...)` never triggered
                raise ValueError(options.range_type)
        elif options.model_type == 'fragments':
            events = OovEventGenerator(knownWords, fragmentizer, order)
        elif options.model_type == 'phonemes':
            events = PhonemeEventGenerator(lexicon, order)
        else:
            # fail here instead of with an unbound `events` further down
            raise ValueError(options.model_type)

    # 5. create modified LM training corpus counts
    if options.write_events:
        print('creating sequence model events ...')
        f = gOpenOut(options.write_events, defaultEncoding)
        for event, count in events(gOpenIn(options.text, defaultEncoding)):
            print(repr(event), '\t', count, file=f)

    # 6. count LM events
    if options.write_counts:
        print('creating sequence model counts ...')
        counts = mGramCounts.SimpleMultifileStorage()
        counts.addIter(events(gOpenIn(options.text, defaultEncoding)))
        mGramCounts.TextStorage.write(
            gOpenOut(options.write_counts, defaultEncoding), counts)

    # 7. dump list of OOV words and their corresponding fragmentation
    if options.write_fragments:
        print('dumping fragments ...')
        f = gOpenOut(options.write_fragments, defaultEncoding)
        events = OovFragmentGenerator(knownWords, fragmentizer)
        fragments = events(gOpenIn(options.text, defaultEncoding))
        for event in fragments.keys():
            print(event, '\t', ' '.join(fragments[event]), file=f)

    # 8. dump modified LM training text
    if options.write_lm_text:
        print('dumping modified LM training text ...')
        f = gOpenOut(options.write_lm_text, defaultEncoding)
        events = OovFragmentGenerator(knownWords, fragmentizer)
        for line in gOpenIn(options.text, defaultEncoding):
            words = line.split()
            modWords = events.modifyLmText(words)
            print(" ".join(modWords), file=f)