class Fragmentizer:
    def __init__(self, model):
        self.model = model
        self.translator = Translator(self.model)
        self.memory = dict()

    def addSupervised(self, lexicon=None):
        """
        Caveat: supervised splitting might come up with graphones that are
        NOT present in the model g2p, because they were trimmed!
        Therefore this function may modify the sequitur inventory.
        """
        segmenter = Segmenter(self.model)
        fragments = set()
        for orth, phon in lexicon:
            logLik, joint = segmenter.firstBestJoint(orth, phon)
            for fragment in joint:
                fragments.add(fragment)
            joint = [lmToken(gra, pho) for gra, pho in joint]
            if orth not in self.memory:
                self.memory[orth] = []
            self.memory[orth].append(joint)
        oldSize, newSize = self.model.strip()
        print("stripped number of multigrams from %d to %d" % (oldSize, newSize))
        sequitur = self.model.sequitur
        for gra, pho in fragments:
            fragment = (
                sequitur.leftInventory.parse(gra),
                sequitur.rightInventory.parse(pho),
            )
            sequitur.inventory.index(fragment)
        self.translator.setModel(self.model)

    def __call__(self, word):
        translations = []
        if word in self.memory:
            translations = self.memory[word]
        else:
            try:
                logLik, joint = self.translator.firstBestJoint(word)
                joint = [lmToken(gra, pho) for gra, pho in joint]
                translations.append(joint)
            except Translator.TranslationFailure:
                print('failed to represent "%s" using graphones' % word)
                translations.append([word + "[UNKNOWN]"])
        return translations
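# Hedged usage sketch (not part of the original snippets): one way the Fragmentizer
# above might be driven. The model pickle path and the (orthography, phoneme-tuple)
# lexicon entry are illustrative assumptions, not values from the source project.
import pickle

with open("g2p-model.pickle", "rb") as f:      # hypothetical sequitur model file
    model = pickle.load(f)

fragmentizer = Fragmentizer(model)
# addSupervised expects (orth, phon) pairs; this entry is made up for illustration.
fragmentizer.addSupervised([("hello", ("h", "eh", "l", "ow"))])
# Unseen words fall back to translator.firstBestJoint inside __call__.
print(fragmentizer("world"))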
def main(options, args):
    if options.phoneme_to_phoneme:
        loadSample = loadP2PSample
    else:
        loadSample = loadG2PSample
    if options.fakeTranslator:
        translator = MemoryTranslator(loadSample(options.fakeTranslator))
    else:
        model = SequiturTool.procureModel(options, loadSample, log=stdout)
        if not model:
            return 1
        if options.testSample or options.applySample:
            translator = Translator(model)
            if options.stack_limit:
                translator.setStackLimit(options.stack_limit)
            del model
    if options.testSample:
        mainTest(translator, loadSample(options.testSample), options)
        translator.reportStats(sys.stdout)
    if options.applySample:
        mainApply(translator, options)
        translator.reportStats(sys.stderr)
def g2pMain(options, args):
    import locale
    loadSample = loadG2PSample
    enc = locale.getpreferredencoding()
    if hasattr(sys.stdout, 'buffer'):
        log_stdout = codecs.getwriter(enc)(sys.stdout.buffer, errors='backslashreplace')
    else:
        log_stdout = codecs.getwriter(enc)(sys.stdout, errors='backslashreplace')
    if hasattr(sys.stderr, 'buffer'):
        log_stderr = codecs.getwriter(enc)(sys.stderr.buffer, errors='backslashreplace')
    else:
        log_stderr = codecs.getwriter(enc)(sys.stderr, errors='backslashreplace')
    if options.fakeTranslator:
        translator = MemoryTranslator(loadSample(options.fakeTranslator))
    else:
        model = SequiturTool.procureModel(options, loadSample, log=log_stdout)
        if not model:
            return 1
        if options.testSample or options.applySample or options.applyWord:
            translator = Translator(model)
            if options.stack_limit:
                translator.setStackLimit(options.stack_limit)
            del model
    if options.applyWord:
        return g2pApplyWord(translator, options, log_stdout)
def __init__(self, dict_path=__dict_path__, model_path=__model_path__):
    self._dict_ = dict()
    dict_path = os.path.expanduser(dict_path)
    model_path = os.path.expanduser(model_path)
    self.__dict_path__ = dict_path
    self.__model_path__ = model_path
    sequitur_options = Values()
    sequitur_options.resume_from_checkpoint = False
    sequitur_options.modelFile = model_path
    sequitur_options.shouldRampUp = False
    sequitur_options.trainSample = False
    sequitur_options.shouldTranspose = False
    sequitur_options.newModelFile = False
    sequitur_options.shouldSelfTest = False
    self.__model__ = SequiturTool.procureModel(sequitur_options, None)
    if not self.__model__:
        logger.error('Can\'t load g2p model.')
        return None
    self.__model__ = Translator(self.__model__)
    a = open(dict_path).readlines()
    a = [i.strip('\n') for i in a]
    for i in a:
        i = i.split(' ')
        self._dict_[i[0]] = i[1:]
class Fragmentizer:
    def __init__(self, model):
        self.model = model
        self.translator = Translator(self.model)
        self.memory = dict()

    def addSupervised(self, lexicon=None):
        """
        Caveat: supervised splitting might come up with graphones that are
        NOT present in the model g2p, because they were trimmed!
        Therefore this function may modify the sequitur inventory.
        """
        segmenter = Segmenter(self.model)
        fragments = set()
        for orth, phon in lexicon:
            logLik, joint = segmenter.firstBestJoint(orth, phon)
            for fragment in joint:
                fragments.add(fragment)
            joint = [ lmToken(gra, pho) for gra, pho in joint ]
            if orth not in self.memory:
                self.memory[orth] = []
            self.memory[orth].append(joint)
        oldSize, newSize = self.model.strip()
        print 'stripped number of multigrams from %d to %d' % (oldSize, newSize)
        sequitur = self.model.sequitur
        for gra, pho in fragments:
            fragment = (
                sequitur.leftInventory.parse(gra),
                sequitur.rightInventory.parse(pho)
            )
            sequitur.inventory.index(fragment)
        self.translator.setModel(self.model)

    def __call__(self, word):
        translations = []
        if word in self.memory:
            translations = self.memory[word]
        else:
            try:
                logLik, joint = self.translator.firstBestJoint(word)
                joint = [ lmToken(gra, pho) for gra, pho in joint ]
                translations.append(joint)
            except Translator.TranslationFailure:
                print 'failed to represent "%s" using graphones' % word
                translations.append([word+'[UNKNOWN]'])
        return translations
def procureModel(self):
    if self.options.resume_from_checkpoint:
        model = ModelTemplate.resume(self.options.resume_from_checkpoint)
        self.sequitur = model.sequitur
    elif self.options.modelFile:
        if sys.version_info[:2] >= (3, 0):
            model = pickle.load(open(self.options.modelFile, 'rb'), encoding='latin1')
        else:
            try:
                model = pickle.load(open(self.options.modelFile, 'rb'))
            except ValueError:
                print('This error most likely occured because the loaded model was created in python3.\n',
                      file=sys.stderr)
                raise
        self.sequitur = model.sequitur
    else:
        self.sequitur = Sequitur()
        model = None
    if self.options.shouldRampUp:
        model.rampUp()
    if self.options.trainSample:
        model = self.trainModel(model)
        if not model:
            print('failed to estimate or load model', file=self.log)
            return
    if not model:
        raise UsageError
    # model.sequenceModel.showMostProbable(sys.stdout, model.sequitur.symbol, limit=250)
    if self.options.shouldTranspose:
        model.transpose()
    if self.options.newModelFile:
        oldSize, newSize = model.strip()
        print('stripped number of multigrams from %d to %d' % (oldSize, newSize), file=self.log)
        f = open(self.options.newModelFile, 'wb')
        pickle.dump(model, f, pickle.HIGHEST_PROTOCOL)
        f.close()
        del f
    if self.options.shouldSelfTest:
        print('warning: --self-test does not treat pronunciation variants correctly', file=self.log)
        if not self.develSample:
            print('error: cannot do --self-test without --devel sample', file=self.log)
        else:
            translator = Translator(model)
            evaluator = Evaluator()
            evaluator.setSample(self.develSample)
            evaluator.verboseLog = self.log
            result = evaluator.evaluate(translator)
            print(result, file=self.log)
    return model
def procureModel(self):
    if self.options.resume_from_checkpoint:
        model = ModelTemplate.resume(self.options.resume_from_checkpoint)
        self.sequitur = model.sequitur
    elif self.options.modelFile:
        model = pickle.load(open(self.options.modelFile, "rb"))
        self.sequitur = model.sequitur
    else:
        self.sequitur = Sequitur()
        model = None
    if self.options.shouldRampUp:
        model.rampUp()
    if self.options.trainSample:
        model = self.trainModel(model)
        if not model:
            print('failed to estimate or load model', file=self.log)
            return
    if not model:
        raise UsageError
    # model.sequenceModel.showMostProbable(sys.stdout, model.sequitur.symbol, limit=250)
    if self.options.shouldTranspose:
        model.transpose()
    if self.options.newModelFile:
        oldSize, newSize = model.strip()
        print('stripped number of multigrams from %d to %d' % (oldSize, newSize), file=self.log)
        f = open(self.options.newModelFile, 'wb')
        pickle.dump(model, f)
        f.close()
        del f
    if self.options.shouldSelfTest:
        print('warning: --self-test does not treat pronunciation variants correctly', file=self.log)
        if not self.develSample:
            print('error: cannot do --self-test without --devel sample', file=self.log)
        else:
            translator = Translator(model)
            evaluator = Evaluator()
            evaluator.setSample(self.develSample)
            evaluator.verboseLog = self.log
            result = evaluator.evaluate(translator)
            print(result, file=self.log)
    return model
def __init__(self, modelfn=SEQUITUR_MODEL):
    options = SeqOptionsObject()
    options.resume_from_checkpoint = False
    options.modelFile = modelfn
    options.shouldRampUp = False
    options.trainSample = None
    options.shouldTranspose = False
    options.newModelFile = None
    options.shouldSelfTest = False
    self.model = SequiturTool.procureModel(options, loadG2PSample, log=sys.stdout)
    self.translator = Translator(self.model)
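# Hedged usage sketch (illustrative, not from the original code): once the __init__
# above has built self.translator, a pronunciation can be requested per word.
# The class name G2PWrapper and the model path are assumptions for this sketch only.
from sequitur import Translator

g2p = G2PWrapper("model-6.pickle")             # hypothetical class name and model file
try:
    phonemes = g2p.translator("hello")         # Translator is applied to the word's graphemes
    print(" ".join(phonemes))
except Translator.TranslationFailure:
    print("no pronunciation found")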
def main(options, args):
    model = SequiturTool.procureModel(options, loadSample)
    if options.applySample:
        lines = gopen(options.applySample).readlines()
        words = Set([word for line in lines for word in line.split()])
        addUnknowns(model, words)
        translator = Translator(model)
        for line in lines:
            left = tuple(line.split())
            try:
                result = translator(left)
                print(" ".join(result))
            except translator.TranslationFailure:
                print("<translation-failed/>")
def main(options, args):
    import locale
    if options.phoneme_to_phoneme:
        loadSample = loadP2PSample
    else:
        loadSample = loadG2PSample
    enc = locale.getpreferredencoding()
    if hasattr(sys.stdout, 'buffer'):
        log_stdout = codecs.getwriter(enc)(sys.stdout.buffer, errors='backslashreplace')
    else:
        log_stdout = codecs.getwriter(enc)(sys.stdout, errors='backslashreplace')
    if hasattr(sys.stderr, 'buffer'):
        log_stderr = codecs.getwriter(enc)(sys.stderr.buffer, errors='backslashreplace')
    else:
        log_stderr = codecs.getwriter(enc)(sys.stderr, errors='backslashreplace')
    # the encoding relates to the lexicon, not the standard IO
    # log_stdout = codecs.getwriter(options.encoding, errors='backslashreplace')(sys.stdout) if options.encoding else sys.stdout
    # log_stderr = codecs.getwriter(options.encoding, errors='backslashreplace')(sys.stderr) if options.encoding else sys.stderr
    if options.fakeTranslator:
        translator = MemoryTranslator(loadSample(options.fakeTranslator))
    else:
        model = SequiturTool.procureModel(options, loadSample, log=log_stdout)
        if not model:
            return 1
        if options.testSample or options.applySample or options.applyWord:
            translator = Translator(model)
            if options.stack_limit:
                translator.setStackLimit(options.stack_limit)
            del model
    if options.testSample:
        mainTest(translator, loadSample(options.testSample), options, log_stdout)
        translator.reportStats(log_stdout)
    if options.applySample:
        mainApply(translator, options, gOpenOut('-', options.encoding or defaultEncoding))
        translator.reportStats(log_stderr)
    if options.applyWord:
        mainApplyWord(translator, options, log_stdout)
def __init__(self, model_path):
    class options(object):
        pass
    options = options()
    options.testSample = None
    options.modelFile = model_path
    options.trainSample = None
    options.encoding = 'ISO-8859-15'
    options.shouldInitializeWithCounts = None
    options.psyco = None
    options.stack_limit = None
    options.shouldTranspose = None
    options.applySample = 'args.txt'
    options.shouldRampUp = None
    options.resume_from_checkpoint = None
    options.lengthConstraints = None
    options.checkpoint = None
    options.eager_discount_adjustment = None
    options.fakeTranslator = None
    options.tempdir = None
    options.profile = None
    options.variants_number = None
    options.maxIterations = 100
    options.testResult = None
    options.variants_mass = None
    options.shouldSuppressNewMultigrams = None
    options.develSample = None
    options.shouldWipeModel = None
    options.resource_usage = None
    options.test_segmental = None
    options.fixed_discount = None
    options.newModelFile = None
    options.minIterations = 20
    options.shouldSelfTest = None
    options.viterbi = None
    options.shouldTestContinuously = None
    options.phoneme_to_phoneme = None
    import codecs
    global defaultEncoding
    defaultEncoding = options.encoding
    global stdout, stderr
    encoder, decoder, streamReader, streamWriter = codecs.lookup(options.encoding)
    stdout = streamWriter(sys.stdout)
    stderr = streamWriter(sys.stderr)
    loadSample = loadG2PSample
    model = SequiturTool.procureModel(options, loadSample, log=stdout)
    self.translator = Translator(model)
def translate(text):
    text = text.replace(",", " ,")
    text = text.replace(".", " .")
    text = text.replace("?", " ?")
    text = text.replace(":", " .")
    text = text.replace("\"", "")
    translator = Translator(g2p)
    phone = []
    for w in text.split(" "):
        try:
            if w in [".", ",", "?"]:
                phone.append("sp")
            if w == "<sp>":
                phone.append("sp")
            else:
                phones = translator(w.lower())
                phone.extend(phones)
                phone.append(" ")
        except Translator.TranslationFailure:
            pass
    return phone
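# Hedged usage sketch (not from the original code): translate() above relies on a
# module-level `g2p` model being loaded beforehand. The pickle path is an assumption.
import pickle

with open("g2p-model.pickle", "rb") as f:      # hypothetical sequitur model file
    g2p = pickle.load(f)

# Punctuation is padded with spaces inside translate(), then mapped to "sp" pauses.
print(translate("hello world, how are you?"))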
def __init__(self, model):
    self.model = model
    self.translator = Translator(self.model)
    self.memory = dict()
def __setstate__(self, d):
    self.__dict__ = d
    self.translator = Translator(self.model)
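# Hedged note (not from the original sources): rebuilding the Translator in
# __setstate__ suggests the translator is excluded from pickling and reconstructed
# from self.model on load. A matching __getstate__ might look like this; the method
# and its attribute handling are assumptions mirroring the snippet above.
def __getstate__(self):
    state = self.__dict__.copy()
    state.pop("translator", None)   # drop the derived, non-pickled object
    return state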
    try:
        return m[s]
    except:
        return s

if __name__ == "__main__":
    chardictfn = sys.argv[1]
    datadir = sys.argv[2]
    altlangtags = sys.argv[3].split(",")
    with codecs.open(chardictfn, encoding="utf-8") as infh:
        chardict = dict([(line.split()[0], line.split()[1:]) for line in infh if line.strip() != ""])
    translators = {}
    phonemaps = {}
    with open(os.path.join(datadir, "g2p.model.pickle")) as infh:
        translators[""] = Translator(pickle.load(infh))
    for altlangtag in altlangtags:
        with open(os.path.join(datadir, "g2p.model."+altlangtag+".pickle")) as infh:
            translators[altlangtag] = Translator(pickle.load(infh))
        with open(os.path.join(datadir, "g2p.phonemap."+altlangtag+".tsv")) as infh:
            fields = [line.strip().split("\t") for line in infh if line.strip()]
            phonemaps[altlangtag] = dict(fields)
    for line in sys.stdin:
        line = unicode(line, encoding="utf-8").strip()
        word = line.split("<")[0]
        try:
            pronun = chardict[word]
        except KeyError:
            try:
                pronun = None
except ImportError:
    import pickle

from sequitur import Translator

UNK_WORD = "<unk>"  # DEMIT: centralize this at some stage

if __name__ == "__main__":
    chardictfn = sys.argv[1]
    g2pmodelfn = sys.argv[2]
    with codecs.open(chardictfn, encoding="utf-8") as infh:
        chardict = dict([(line.split()[0], line.split()[1:]) for line in infh if line.strip() != ""])
    with open(g2pmodelfn) as infh:
        g2pmodel = pickle.load(infh)
    translator = Translator(g2pmodel)
    for line in sys.stdin:
        line = unicode(line, encoding="utf-8").strip()
        word = line.split("<")[0]
        try:
            pronun = chardict[word]
        except KeyError:
            try:
                pronun = translator(word)
                if not pronun:
                    pronun = chardict[UNK_WORD]
            except BaseException as e:
                print("FAILED WORD:", word.encode("utf-8"), file=sys.stderr)
                pronun = chardict[UNK_WORD]
def procureModel(self):
    #print self.options,type(self.options)
    #print self.loadSample,type(self.loadSample)
    #print self.log,type(self.log)
    if self.options.resume_from_checkpoint:
        model = ModelTemplate.resume(self.options.resume_from_checkpoint)
        self.sequitur = model.sequitur
    elif self.options.modelFile:
        #print "loading",self.options.modelFile
        f = open(self.options.modelFile)
        #print "loaded",f
        #print "type:",type(f)
        #print pickle
        class Model(object):
            pass
        model = pickle.load(f)
        #print "loaded",self.options.modelFile
        self.sequitur = model.sequitur
    else:
        self.sequitur = Sequitur()
        model = None
    if self.options.shouldRampUp:
        model.rampUp()
    if self.options.trainSample:
        model = self.trainModel(model)
        if not model:
            print >> self.log, 'failed to estimate or load model'
            return
    if not model:
        raise UsageError
    # model.sequenceModel.showMostProbable(sys.stdout, model.sequitur.symbol, limit=250)
    if self.options.shouldTranspose:
        model.transpose()
    if self.options.newModelFile:
        oldSize, newSize = model.strip()
        print >> self.log, 'stripped number of multigrams from %d to %d' % (oldSize, newSize)
        f = open(self.options.newModelFile, 'w')
        pickle.dump(model, f, pickle.HIGHEST_PROTOCOL)
        f.close()
        del f
    if self.options.shouldSelfTest:
        print >> self.log, 'warning: --self-test does not treat pronunciation variants correctly'
        if not self.develSample:
            print >> self.log, 'error: cannot do --self-test without --devel sample'
        else:
            translator = Translator(model)
            evaluator = Evaluator()
            evaluator.setSample(self.develSample)
            evaluator.verboseLog = self.log
            result = evaluator.evaluate(translator)
            print >> self.log, result
    return model
def __call__(self, log, context, model):
    translator = Translator(model)
    result = self.evaluator.evaluate(translator)
    print('ER %s: string errors %s symbol errors %s'
          % (self.name, result.stringError, result.symbolError), file=log)
def __init__(self, jsmmodel, graphtranstable):
    self.gmap = graphtranstable
    self.model = jsmmodel
    self.translator = Translator(self.model)