Example #1
    def makeZeroOrder(self, allCounts):
	minCount, discount = self.parametrizeOrder(0)

	counts = sumLotsOfCounts(itertools.imap(lambda item : item[1], allCounts))
	effectiveCounts, total = self.effectiveCounts(counts, minCount, discount)
	effectiveTotal = effectiveCounts.sum()

	seenWords = set([w for w, n in effectiveCounts])
	assert self.sentenceStart not in seenWords
	unseenWords = set(self.predictedWords) - seenWords
	assert self.sentenceStart not in unseenWords
	self.log('number of unseen words', len(unseenWords))

	pZero = 1.0 / len(self.predictedWords)  # explicit float division; plain 1/len truncates to 0 under default Python 2
	backOffMass = total - effectiveTotal
	nZero = backOffMass * pZero
	interpolatedCounts = []
	for predicted, effectiveCount in effectiveCounts:
	    interpolatedCounts.append((predicted, effectiveCount + nZero))
	for predicted in unseenWords:
	    interpolatedCounts.append((predicted, nZero))
	interpolatedCounts = Counts(interpolatedCounts)

	self.log('%d predicted events' % (interpolatedCounts.size))
	return [(MGram(()), (interpolatedCounts, total))]
Example #2
    def makeZeroOrder(self, allCounts):
        minCount, discount = self.parametrizeOrder(0)

        counts = sumLotsOfCounts(map(lambda item: item[1], allCounts))
        effectiveCounts, total = self.effectiveCounts(counts, minCount,
                                                      discount)
        effectiveTotal = effectiveCounts.sum()

        seenWords = set([w for w, n in effectiveCounts])
        assert self.sentenceStart not in seenWords
        unseenWords = set(self.predictedWords) - seenWords
        assert self.sentenceStart not in unseenWords
        self.log('number of unseen words', len(unseenWords))

        pZero = 1 / len(self.predictedWords)
        backOffMass = total - effectiveTotal
        nZero = backOffMass * pZero
        interpolatedCounts = []
        for predicted, effectiveCount in effectiveCounts:
            interpolatedCounts.append((predicted, effectiveCount + nZero))
        for predicted in unseenWords:
            interpolatedCounts.append((predicted, nZero))
        interpolatedCounts = Counts(interpolatedCounts)

        self.log('%d predicted events' % (interpolatedCounts.size))
        return [(MGram(()), (interpolatedCounts, total))]
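
A note on the one substantive change between the two versions above: itertools.imap no longer exists in Python 3, where the built-in map is itself lazy. A minimal, self-contained sketch of the difference (toy data, not the real allCounts structure):

import sys

pairs = [('a', 1), ('b', 2), ('c', 3)]

if sys.version_info[0] < 3:
    from itertools import imap        # Python 2: lazy variant of map
    lazy = imap(lambda item: item[1], pairs)
else:
    lazy = map(lambda item: item[1], pairs)  # Python 3: map is already lazy

assert list(lazy) == [1, 2, 3]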
Example #3
def loadP2PSample(compfname):
    fnames = compfname.split(':')
    assert len(fnames) == 2
    left = dict(loadG2PSample(fnames[0]))
    right = dict(loadG2PSample(fnames[1]))
    sample = []
    for w in set(left.keys()) & set(right.keys()):
        sample.append((left[w], right[w]))
    return sample
Example #4
def loadP2PSample(compfname):
    fnames = compfname.split(':')
    assert len(fnames) == 2
    left  = dict(loadG2PSample(fnames[0]))
    right = dict(loadG2PSample(fnames[1]))
    sample = []
    for w in set(left.keys()) & set(right.keys()):
        sample.append((left[w], right[w]))
    return sample
Example #5
 def rampUp(self):
     data = self.get()
     histories = set([history for history, predicted, score in data])
     newHistories = set()
     for history, predicted, score in data:
         if predicted is None: continue
         newHistory = history + (predicted, )
         if newHistory not in histories:
             newHistories.add(newHistory)
     for newHistory in newHistories:
         data.append((newHistory, None, 0.0))
     self.set(data)
Example #6
    def rampUp(self):
	data = self.get()
	histories = set([ history for history, predicted, score in data ])
	newHistories = set()
	for history, predicted, score in data:
	    if predicted is None: continue
	    newHistory = history + (predicted,)
	    if newHistory not in histories:
		newHistories.add(newHistory)
	for newHistory in newHistories:
	    data.append((newHistory, None, 0.0))
	self.set(data)
Example #7
    def addSupervised(self, lexicon=None):
	"""
	Caveat: supervised splitting might come up with graphones that
	are NOT present in the model g2p, because they were trimmed!
	Therefore this function may modify the sequitur inventory.
	"""
	segmenter = Segmenter(self.model)
	fragments = set()
	for orth, phon in lexicon:
	    logLik, joint = segmenter.firstBestJoint(orth, phon)
	    for fragment in joint:
		fragments.add(fragment)
	    joint = [ lmToken(gra, pho) for gra, pho in joint ]
	    if orth not in self.memory: self.memory[orth] = []
	    self.memory[orth].append(joint)

	oldSize, newSize = self.model.strip()
	print('stripped number of multigrams from %d to %d' % (oldSize, newSize))

	sequitur = self.model.sequitur
	for gra, pho in fragments:
	    fragment = ( sequitur.leftInventory.parse(gra),
			 sequitur.rightInventory.parse(pho) )
	    sequitur.inventory.index(fragment)
	self.translator.setModel(self.model)
Example #8
    def addSupervised(self, lexicon=None):
	"""
	Caveat: supervised splitting might come up with graphones that
	are NOT present in the model g2p, because they were trimmed!
	Therefore this function may modify the sequitur inventory.
	"""
	segmenter = Segmenter(self.model)
	fragments = set()
	for orth, phon in lexicon:
	    logLik, joint = segmenter.firstBestJoint(orth, phon)
	    for fragment in joint:
		fragments.add(fragment)
	    joint = [ lmToken(gra, pho) for gra, pho in joint ]
	    if orth not in self.memory: self.memory[orth] = []
	    self.memory[orth].append(joint)

	oldSize, newSize = self.model.strip()
	print 'stripped number of multigrams from %d to %d' % (oldSize, newSize)

	sequitur = self.model.sequitur
	for gra, pho in fragments:
	    fragment = ( sequitur.leftInventory.parse(gra),
			 sequitur.rightInventory.parse(pho) )
	    sequitur.inventory.index(fragment)
	self.translator.setModel(self.model)
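
The only change between these two versions of addSupervised is the print statement becoming a function call. Code that still has to run under Python 2 can opt into the function form explicitly; a minimal sketch (the toy values stand in for model.strip()):

from __future__ import print_function   # no-op on Python 3

oldSize, newSize = 10, 7
print('stripped number of multigrams from %d to %d' % (oldSize, newSize))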
Example #9
class EventGenerator:
    specialEvents = set([
	'<s>', '</s>' ])

    def __init__(self, knownWords, fragmentizer, order):
	self.knownWords = set(knownWords)
	self.fragmentizer = fragmentizer
	self.order = order
	self.rotor = RotatingDict()

    def fragmentize(self, word):
	if word not in self.rotor:
	    self.rotor[word] = tuple(self.fragmentizer(word))
	return self.rotor[word]

    def frobnicate(self, rawWords):
	raise NotImplementedError

    def __call__(self, source):
	for line in source:
	    words = line.split()
	    if words[0] != '<s>':
		assert words[-1] != '</s>'
		words = ['<s>'] + words + ['</s>']
	    for event in self.frobnicate(words):
		yield event, 1
Example #10
 def setVocabulary(self, vocabulary):
     self.vocabulary = vocabulary
     self.sentenceStart = vocabulary.index('<s>')
     predictedWords = set(self.vocabulary.indices())
     predictedWords.remove(self.sentenceStart)
     predictedWords.remove(self.vocabulary.noneIndex)
     self.predictedWords = list(predictedWords)
     self.predictedWords.sort()
Example #11
 def masterSequenceModel(self, model):
     allHistories = set()
     for history, predicted, score in model.sequenceModel.get():
         allHistories.add(history)
     result = SequenceModel.SequenceModel()
     result.setInitAndTerm(self.sequitur.term, self.sequitur.term)
     result.set([ (history, None, 0.0) for history in allHistories ])
     return result
Example #12
 def masterSequenceModel(self, model):
     allHistories = set()
     for history, predicted, score in model.sequenceModel.get():
         allHistories.add(history)
     result = SequenceModel.SequenceModel()
     result.setInitAndTerm(self.sequitur.term, self.sequitur.term)
     result.set([(history, None, 0.0) for history in allHistories])
     return result
Example #13
    def setVocabulary(self, vocabulary):
	self.vocabulary = vocabulary
	self.sentenceStart = vocabulary.index('<s>')
	predictedWords = set(self.vocabulary.indices())
	predictedWords.remove(self.sentenceStart)
	predictedWords.remove(self.vocabulary.noneIndex)
	self.predictedWords = list(predictedWords)
	self.predictedWords.sort()
Example #14
 def wipeOut(self, vocabularySize):
     histories = set()
     for history, predicted, score in self.get():
         histories.add(history)
     histories.remove(())
     data = [((), None, math.log(vocabularySize))]
     for history in histories:
         data.append((history, None, 0.0))
     self.set(data)
Example #15
    def wipeOut(self, vocabularySize):
	histories = set()
	for history, predicted, score in self.get():
	    histories.add(history)
	histories.remove(())
	data = [((), None, math.log(vocabularySize))]
	for history in histories:
	    data.append((history, None, 0.0))
	self.set(data)
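
A remark on the constant used by wipeOut: assuming the stored scores are negative log-probabilities (the snippet itself does not say), math.log(vocabularySize) for the empty history is exactly the score of a uniform model over the vocabulary, since -log(1/V) = log V. A one-line check of that identity:

import math

V = 50000
assert abs(math.log(V) - (-math.log(1.0 / V))) < 1e-12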
Example #16
    def rampUp(self):
	newHistories = set()
	for (history, predicted), probability in self.prob.iteritems():
	    if predicted is None: continue
	    newHistory = history + (predicted,)
	    if (newHistory, None) not in self.prob:
		newHistories.add(newHistory)
	for newHistory in newHistories:
	    self.prob[(newHistory, None)] = 1.0
	self.compiled = None
Example #17
 def rampUp(self):
     newHistories = set()
     for (history, predicted), probability in self.prob.items():
         if predicted is None: continue
         newHistory = history + (predicted, )
         if (newHistory, None) not in self.prob:
             newHistories.add(newHistory)
     for newHistory in newHistories:
         self.prob[(newHistory, None)] = 1.0
     self.compiled = None
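
The substantive change in this pair is iteritems() becoming items(): Python 2's dict.items() builds a list and iteritems() yields lazily, while Python 3's dict.items() returns a lightweight view. A self-contained sketch of the unpacking pattern used above, with made-up probabilities:

prob = {(('a',), 'b'): 0.75, (('a',), None): 0.25}

# Python 3: items() is a view; no intermediate list is built.
for (history, predicted), probability in prob.items():
    if predicted is None:
        continue
    print(history + (predicted,), probability)   # prints: ('a', 'b') 0.75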
Example #18
def mainTest(translator, testSample, options):
    if options.shouldTranspose:
        testSample = SequiturTool.transposeSample(testSample)
    if options.testResult:
        resultFile = gOpenOut(options.testResult, defaultEncoding)
    else:
        resultFile = None
    from Evaluation import Evaluator
    evaluator = Evaluator()
    evaluator.setSample(testSample)
    evaluator.resultFile = resultFile
    evaluator.verboseLog = stdout
    if options.test_segmental:
        supraSegmental = set(['.', "'", '"'])
        def removeSupraSegmental(phon):
            return filter(lambda p: p not in supraSegmental, phon)
        evaluator.compareFilter = removeSupraSegmental
    result = evaluator.evaluate(translator)
    print >> stdout, result
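
mainTest is still Python 2 throughout (print >> stdout, and a filter call that is expected to return a sequence). A sketch of how those two idioms would translate to Python 3; supraSegmental is copied from the snippet, the toy phoneme list is made up:

import sys

supraSegmental = set(['.', "'", '"'])

def removeSupraSegmental(phon):
    # Python 3's filter() is lazy, so build the sequence explicitly.
    return [p for p in phon if p not in supraSegmental]

# print >> stdout, result  becomes  print(result, file=stdout)
print(removeSupraSegmental(['h', "'", 'e', '.', 'l']), file=sys.stdout)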
Example #19
class OovFragmentGenerator:
    specialEvents = set(["<s>", "</s>"])

    def __init__(self, knownWords, fragmentizer):
        self.knownWords = set(knownWords)
        self.fragmentizer = fragmentizer
        self.rotor = RotatingDict()
        self.fragmentDict = {}

    def fragmentize(self, word):
        if word not in self.rotor:
            self.rotor[word] = tuple(self.fragmentizer(word))
        return self.rotor[word]

    def __call__(self, source):
        for line in source:
            words = line.split()
            self.frobnicate(words)
        return self.fragmentDict

    def frobnicate(self, rawWords):
        for w in rawWords:
            if w in self.knownWords:
                continue
            if w in self.specialEvents:
                continue
            if w in self.fragmentDict:
                continue
            fragments = self.fragmentize(w)
            self.fragmentDict[w] = fragments

    def modifyLmText(self, rawWords):
        modWords = []
        for w in rawWords:
            if w in self.knownWords:
                modWords.append(w)
            elif w in self.specialEvents:
                modWords.append(w)
            else:
                fragments = self.fragmentize(w)
                modWords.append(" ".join(fragments))
        return modWords
Example #20
def main(options, args):
    # 1. load reference lexicon
    print 'loading reference lexicon ...'
    lexicon = loadBlissLexicon(options.lexicon)
    knownWords = set([ orth for orth, phon in lexicon ])

    # 2. load model for fragmentizing unknown words
    if options.subliminal_lexicon:
	print 'loading subliminal lexicon ...'
	subliminalLexicon = loadBlissLexicon(options.subliminal_lexicon)
    else:
	subliminalLexicon = None

    if options.subliminal_g2p:
	print 'loading subliminal g2p model ...'
	subliminalG2p = pickle.load(open(options.subliminal_g2p))
    else:
	subliminalG2p = None

    if options.g2pModel:
	print 'loading g2p model ...'
	model = pickle.load(open(options.g2pModel))
	oldSize, newSize = model.strip()
	print 'stripped number of multigrams from %d to %d' % (oldSize, newSize)

	fragmentizer = Fragmentizer(model)
	if subliminalLexicon:
	    fragmentizer.addSupervised(subliminalLexicon)
	if subliminalG2p:
	    fragmentizer.addSupervised(subliminalG2p)
	graphones = model.sequitur.symbols()
	graphones.remove(model.sequitur.symbol(model.sequitur.term))
    else:
	model = fragmentizer = graphones = None

    # 3. add fragments to lexicon
    if options.write_lexicon:
	print 'creating extended lexicon ...'
	xmlLexicon = ElementTree(file = options.lexicon)
	if options.model_type == 'phonemes':
	    changeSyntaticToPhonetic(xmlLexicon)
	else:
	    addGraphonesToLexicon(xmlLexicon, graphones)
	xmlLexicon.write(gOpenOut(options.write_lexicon), defaultEncoding)

    # 4. determine set of LM tokens
    vocabulary = mGramCounts.ClosedVocablary()
    vocabulary.add(['<s>', '</s>'])
    if options.model_type == 'flat-hybrid':
	vocabulary.add(ifilter(isLmToken, knownWords), soft=True)
    if graphones:
	vocabulary.add(starmap(lmToken, graphones))
    vocabulary.sort()
    if options.write_tokens:
	f = gOpenOut(options.write_tokens, defaultEncoding)
	if options.model_type == 'phonemes':
	    phonemes = set(p for orth, phon in lexicon for p in phon)
	    phonemes.add('#1')
	    if 'si' in phonemes: phonemes.remove('si')
	    for p in sorted(phonemes):
		print >> f, p
	else:
	    for w in vocabulary:
		if w is not None:
		    print >> f, w

    # 5./6. set-up LM event generator
    if options.write_counts or options.write_events:
	order = options.order - 1
	if options.model_type == 'flat-hybrid':
	    events = HybridEventGenerator(knownWords, fragmentizer, order)
	    if options.range_type == 'fragments':
		events.setFragmentRange()
	    elif options.range_type == 'words':
		events.setTrueWordRange()
	    else:
		raise ValueError(options.range_type)
	elif options.model_type == 'fragments':
	    events = OovEventGenerator(knownWords, fragmentizer, order)
	elif options.model_type == 'phonemes':
	    events = PhonemeEventGenerator(lexicon, order)

    # 5. create modified LM training corpus counts
    if options.write_events:
	print 'creating sequence model events ...'
	f = gOpenOut(options.write_events, defaultEncoding)
	for event, count in events(gOpenIn(options.text, defaultEncoding)):
	    print >> f, repr(event), '\t', count

    # 6. count LM events
    if options.write_counts:
	print 'creating sequence model counts ...'
	counts = mGramCounts.SimpleMultifileStorage()
	counts.addIter(events(gOpenIn(options.text, defaultEncoding)))
	mGramCounts.TextStorage.write(gOpenOut(options.write_counts, defaultEncoding), counts)

    # 7. dump list of OOV words and their corresponding fragmentation
    if options.write_fragments:
        print 'dumping fragments ...'
        f = gOpenOut(options.write_fragments, defaultEncoding)
        events = OovFragmentGenerator(knownWords, fragmentizer)
        fragments = events(gOpenIn(options.text, defaultEncoding))
        for event in fragments.keys():
            print >> f, event, '\t', ' '.join(fragments[event])

    # 8. dump modified LM training text
    if options.write_lm_text:
        print 'dumping modified LM training text ...'
        f = gOpenOut(options.write_lm_text, defaultEncoding)
        events = OovFragmentGenerator(knownWords, fragmentizer)
        for line in gOpenIn(options.text, defaultEncoding):
            words = line.split()
            modWords = events.modifyLmText(words)
            print >> f, " ".join(modWords)
Example #21
 def sizeTemplates(self):
     result = set()
     for i in range(1, self.size() + 1):
         left, right = self.symbol(i)
         result.add((len(left), len(right)))
     return sorted(result)
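
sizeTemplates evidently collects the distinct (grapheme count, phoneme count) shapes present in a multigram inventory, with symbol(i) indexed from 1. A toy stand-in inventory (hypothetical, for illustration only) shows the effect:

class ToyInventory:
    # Stand-in for the real inventory; symbol() is 1-based, as above.
    _symbols = [(('a',), ('A',)), (('b', 'c'), ('B',)), (('d',), ('D',))]

    def size(self):
        return len(self._symbols)

    def symbol(self, i):
        return self._symbols[i - 1]

    def sizeTemplates(self):
        result = set()
        for i in range(1, self.size() + 1):
            left, right = self.symbol(i)
            result.add((len(left), len(right)))
        return sorted(result)

print(ToyInventory().sizeTemplates())   # [(1, 1), (2, 1)]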
Example #22
def main(options, args):
    # 1. load reference lexicon
    print('loading reference lexicon ...')
    lexicon = loadBlissLexicon(options.lexicon)
    knownWords = set([ orth for orth, phon in lexicon ])

    # 2. load model for fragmentizing unknown words
    if options.subliminal_lexicon:
        print('loading subliminal lexicon ...')
        subliminalLexicon = loadBlissLexicon(options.subliminal_lexicon)
    else:
        subliminalLexicon = None

    if options.subliminal_g2p:
        print('loading subliminal g2p model ...')
        subliminalG2p = pickle.load(open(options.subliminal_g2p))
    else:
        subliminalG2p = None

    if options.g2pModel:
        print('loading g2p model ...')
        model = pickle.load(open(options.g2pModel))
        oldSize, newSize = model.strip()
        print('stripped number of multigrams from %d to %d' % (oldSize, newSize))

        fragmentizer = Fragmentizer(model)
        if subliminalLexicon:
            fragmentizer.addSupervised(subliminalLexicon)
        if subliminalG2p:
            fragmentizer.addSupervised(subliminalG2p)
        graphones = model.sequitur.symbols()
        graphones.remove(model.sequitur.symbol(model.sequitur.term))
    else:
        model = fragmentizer = graphones = None

    # 3. add fragments to lexicon
    if options.write_lexicon:
        print('creating extended lexicon ...')
        xmlLexicon = ElementTree(file=options.lexicon)
        if options.model_type == 'phonemes':
            changeSyntaticToPhonetic(xmlLexicon)
        else:
            addGraphonesToLexicon(xmlLexicon, graphones)
        xmlLexicon.write(gOpenOut(options.write_lexicon), defaultEncoding)

    # 4. determine set of LM tokens
    vocabulary = mGramCounts.ClosedVocablary()
    vocabulary.add(['<s>', '</s>'])
    if options.model_type == 'flat-hybrid':
        vocabulary.add(filter(isLmToken, knownWords), soft=True)
    if graphones:
        vocabulary.add(starmap(lmToken, graphones))
    vocabulary.sort()
    if options.write_tokens:
        f = gOpenOut(options.write_tokens, defaultEncoding)
        if options.model_type == 'phonemes':
            phonemes = set(p for orth, phon in lexicon for p in phon)
            phonemes.add('#1')
            if 'si' in phonemes:
                phonemes.remove('si')
            for p in sorted(phonemes):
                print(p, file=f)
        else:
            for w in vocabulary:
                if w is not None:
                    print(w, file=f)

    # 5./6. set-up LM event generator
    if options.write_counts or options.write_events:
        order = options.order - 1
        if options.model_type == 'flat-hybrid':
            events = HybridEventGenerator(knownWords, fragmentizer, order)
            if options.range_type == 'fragments':
                events.setFragmentRange()
            elif options.range_type == 'words':
                events.setTrueWordRange()
            else:
                raise ValueError(options.range_type)
        elif options.model_type == 'fragments':
            events = OovEventGenerator(knownWords, fragmentizer, order)
        elif options.model_type == 'phonemes':
            events = PhonemeEventGenerator(lexicon, order)

    # 5. create modified LM training corpus counts
    if options.write_events:
        print('creating sequence model events ...')
        f = gOpenOut(options.write_events, defaultEncoding)
        for event, count in events(gOpenIn(options.text, defaultEncoding)):
            print(repr(event), '\t', count, file=f)

    # 6. count LM events
    if options.write_counts:
        print('creating sequence model counts ...')
        counts = mGramCounts.SimpleMultifileStorage()
        counts.addIter(events(gOpenIn(options.text, defaultEncoding)))
        mGramCounts.TextStorage.write(gOpenOut(options.write_counts, defaultEncoding), counts)

    # 7. dump list of OOV words and their corresponding fragmentation
    if options.write_fragments:
        print('dumping fragments ...')
        f = gOpenOut(options.write_fragments, defaultEncoding)
        events = OovFragmentGenerator(knownWords, fragmentizer)
        fragments = events(gOpenIn(options.text, defaultEncoding))
        for event in list(fragments.keys()):
            print(event, '\t', ' '.join(fragments[event]), file=f)

    # 8. dump modified LM training text
    if options.write_lm_text:
        print('dumping modified LM training text ...')
        f = gOpenOut(options.write_lm_text, defaultEncoding)
        events = OovFragmentGenerator(knownWords, fragmentizer)
        for line in gOpenIn(options.text, defaultEncoding):
            words = line.split()
            modWords = events.modifyLmText(words)
            print(" ".join(modWords), file=f)
Example #23
 def __init__(self, knownWords, fragmentizer):
     self.knownWords = set(knownWords)
     self.fragmentizer = fragmentizer
     self.rotor = RotatingDict()
     self.fragmentDict = {}
Example #24
import sys
import codecs
import cPickle as pickle
from elementtree.ElementTree import ElementTree, Element, Comment, SubElement
from itertools import ifilter, starmap
import mGramCounts
from sequitur import Segmenter, Translator
from g2p import loadBlissLexicon
from misc import gOpenIn, gOpenOut, set, reversed

# ===========================================================================
nonLmTokens = set("""
"QUOTE
"UNQUOTE
"BEGIN-QUOTE
"END-QUOTE
%PERCENT
.POINT
/SLASH
""".split())

def isLmToken(word):
    return word not in nonLmTokens

# ===========================================================================
def lmToken(letters, phonemes):
    return '*' + ''.join(letters) + ':' + '_'.join(phonemes) + '*'

def addGraphonesToLexicon(xml, graphones):
    lexicon = xml.getroot()
    for letters, phonemes in graphones:
Example #25
import sys
import codecs
import pickle
from elementtree.ElementTree import ElementTree, Element, Comment, SubElement
from itertools import starmap
import mGramCounts
from sequitur import Segmenter, Translator
from g2p import loadBlissLexicon
from misc import gOpenIn, gOpenOut, set, reversed

# ===========================================================================
nonLmTokens = set("""
"QUOTE
"UNQUOTE
"BEGIN-QUOTE
"END-QUOTE
%PERCENT
.POINT
/SLASH
""".split())

def isLmToken(word):
    return word not in nonLmTokens

# ===========================================================================
def lmToken(letters, phonemes):
    return '*' + ''.join(letters) + ':' + '_'.join(phonemes) + '*'

def addGraphonesToLexicon(xml, graphones):
    lexicon = xml.getroot()
    for letters, phonemes in graphones:
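
The import block above was converted mechanically (cPickle to pickle, ifilter dropped from the itertools import). Code that has to run under both interpreters usually uses a guarded import instead; a minimal sketch:

try:
    import cPickle as pickle   # Python 2: C implementation
except ImportError:
    import pickle              # Python 3: the C accelerator is used automatically

blob = pickle.dumps({'order': 3})
assert pickle.loads(blob) == {'order': 3}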