Example #1
def boSection(self, order):
    # Open the output file for this order's back-off distributions.
    f = gOpenOut(self.filename('%dbo' % (order + 1)))
    comment = 'This is a modified back-off %d-gram distribution file.\n' % (
        order + 1)
    if self.notice:
        comment = self.notice + '\n' + comment
    part = self.Writer(f, self.vocabulary, comment)
    return part
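All of these snippets write through gOpenOut, which Example #3 below imports as misc.gOpenOut. As a rough mental model only, a minimal sketch of such a helper follows; the handling of '-' and of gzip-compressed names is an assumption inferred from the call sites, not the project's actual implementation.

import gzip
import sys

def gOpenOut(fname, encoding=None):
    # Hypothetical sketch: '-' selects stdout, a .gz suffix selects
    # transparent gzip compression, anything else is a plain text file.
    if fname == '-':
        return sys.stdout
    if fname.endswith('.gz'):
        return gzip.open(fname, 'wt', encoding=encoding)
    return open(fname, 'w', encoding=encoding)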
Example #2
def topSection(self, order):
    # Open the output file for this order's top-level n-gram model.
    f = gOpenOut(self.filename('%d' % (order + 1)))
    comment = 'This is a %d-gram model file.\n' % (order + 1)
    if self.notice:
        comment = self.notice + '\n' + comment
    part = self.Writer(f, self.vocabulary, comment)
    # Reference the back-off distribution files of all lower orders.
    for oo in range(order):
        part.include(os.path.basename(self.filename('%dbo' % (oo + 1))))
    return part
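Together with Example #1, this shows the intended file layout: topSection writes the top-level n-gram model file, while the part.include(...) calls pull in the back-off distribution files that boSection produces for every lower order.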
Example #3
def main(options, args):
    if options.vocabulary:
        vocabulary = loadVocabulary(options.vocabulary)
    else:
        vocabulary = OpenVocabulary()

    if options.text:
        text = misc.gOpenIn(options.text)
        # Python 3: map() is already lazy; itertools.imap is Python 2 only.
        sentences = map(str.split, text)
        sentences = (list(map(vocabulary.map, s)) for s in sentences)
        grams = mGramsChainCount(sentences, options.order - 1)
        counts = createStorage(options)
        counts.addIter(grams)
    elif options.read:
        if len(options.read) > 1:
            counts = createStorage(options)
            counts.addIter(
                consolidate(
                    mergeSort([TextStorage(fname) for fname in options.read])))
        else:
            counts = TextStorage(options.read[0])
    else:
        print("no counts", file=sys.stderr)
        return

    if options.map_oov:
        if not options.vocabulary:
            print("you need to specify a vocabulary", file=sys.stderr)
            return
        filt = MapUnknownsFilter(counts, vocabulary.list,
                                 vocabulary.unknownSymbol)
        mappedCounts = createStorage(options)
        mappedCounts.addIter(filt.rawIter())
        counts = mappedCounts

    if options.write:
        countFile = misc.gOpenOut(options.write)
        TextStorage.write(countFile, counts)

    if options.counts_of_counts:
        coc = [
            countsOfCounts(mGramReduceToOrder(counts, order))
            for order in range(options.order)
        ]
        import pprint

        pprint.pprint(coc, misc.gOpenOut(options.counts_of_counts))
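The options object accessed above needs at least the attributes order, text, read, vocabulary, map_oov, write and counts_of_counts (createStorage reads further attributes not shown here). A hypothetical optparse driver, with flag names guessed from those attributes; the actual tool's flags may differ:

import optparse

if __name__ == '__main__':
    parser = optparse.OptionParser()
    parser.add_option('--order', type='int', default=3,
                      help='maximum m-gram order to count')
    parser.add_option('--text', help='training text to count m-grams from')
    parser.add_option('--read', action='append',
                      help='existing count file; may be given repeatedly')
    parser.add_option('--vocabulary', help='vocabulary file to load')
    parser.add_option('--map-oov', dest='map_oov', action='store_true',
                      help='map out-of-vocabulary words to the unknown symbol')
    parser.add_option('--write', help='file to write consolidated counts to')
    parser.add_option('--counts-of-counts',
                      help='file to write counts-of-counts statistics to')
    options, args = parser.parse_args()
    main(options, args)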
Example #4
def makeLmWriter(options):
    if options.lm_format == "arpa":
        fname = options.lm
        print("will write LM to", fname, "...", file=sys.stdout)
        lm = LmArpaWriter(gOpenOut(fname), options.order - 1, notice)
    elif options.lm_format == "estar":
        filePrefix, fileSuffix = os.path.splitext(options.lm)
        print("will write LM to %s-*%s ..." % (filePrefix, fileSuffix), file=sys.stdout)
        lm = LmEstarWriter(filePrefix, fileSuffix, notice)
    else:
        raise ValueError(options.lm_format)
    return lm
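makeLmWriter dispatches on options.lm_format: 'arpa' opens options.lm through gOpenOut and writes a single ARPA-style file, 'estar' splits options.lm into a prefix and suffix so that LmEstarWriter can emit one file per LM section, and any other value raises ValueError.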
Example #5
def main(options, args):
    import locale
    if options.phoneme_to_phoneme:
        loadSample = loadP2PSample
    else:
        loadSample = loadG2PSample

    enc = locale.getpreferredencoding()
    if hasattr(sys.stdout, 'buffer'):
        log_stdout = codecs.getwriter(enc)(sys.stdout.buffer,
                                           errors='backslashreplace')
    else:
        log_stdout = codecs.getwriter(enc)(sys.stdout,
                                           errors='backslashreplace')

    if hasattr(sys.stderr, 'buffer'):
        log_stderr = codecs.getwriter(enc)(sys.stderr.buffer,
                                           errors='backslashreplace')
    else:
        log_stderr = codecs.getwriter(enc)(sys.stderr,
                                           errors='backslashreplace')

    # The encoding option relates to the lexicon, not to standard IO, which is
    # why stdout/stderr are wrapped with the locale's preferred encoding above.

    if options.fakeTranslator:
        translator = MemoryTranslator(loadSample(options.fakeTranslator))
    else:
        model = SequiturTool.procureModel(options, loadSample, log=log_stdout)
        if not model:
            return 1
        if options.testSample or options.applySample or options.applyWord:
            translator = Translator(model)
            if options.stack_limit:
                translator.setStackLimit(options.stack_limit)
        del model

    if options.testSample:
        mainTest(translator, loadSample(options.testSample), options,
                 log_stdout)
        translator.reportStats(log_stdout)

    if options.applySample:
        mainApply(translator, options,
                  gOpenOut('-', options.encoding or defaultEncoding))
        translator.reportStats(log_stderr)

    if options.applyWord:
        mainApplyWord(translator, options, log_stdout)
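The hasattr(sys.stdout, 'buffer') checks keep the logging writers portable: Python 3 text streams expose their underlying byte stream as .buffer, which is what a codecs.getwriter wrapper expects, whereas Python 2 file objects accept encoded bytes directly.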
Example #6
def mainTest(translator, testSample, options):
    if options.shouldTranspose:
        testSample = SequiturTool.transposeSample(testSample)
    if options.testResult:
        resultFile = gOpenOut(options.testResult, defaultEncoding)
    else:
        resultFile = None
    from Evaluation import Evaluator
    evaluator = Evaluator()
    evaluator.setSample(testSample)
    evaluator.resultFile = resultFile
    evaluator.verboseLog = stdout
    if options.test_segmental:
        # Ignore supra-segmental symbols (syllable boundaries, stress marks)
        # when comparing pronunciations.
        supraSegmental = set(['.', "'", '"'])
        def removeSupraSegmental(phon):
            return [p for p in phon if p not in supraSegmental]
        evaluator.compareFilter = removeSupraSegmental
    result = evaluator.evaluate(translator)
    print(result, file=stdout)  # 'stdout' is assumed to be a module-level stream
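When options.test_segmental is set, the comparison filter strips the supra-segmental symbols '.', "'" and '"' from each pronunciation before comparison, so the evaluation scores only segmental errors.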
Example #7
def main(options, args):
    # 1. load reference lexicon
    print('loading reference lexicon ...')
    lexicon = loadBlissLexicon(options.lexicon)
    knownWords = set([ orth for orth, phon in lexicon ])

    # 2. load model for fragmentizing unknown words
    if options.subliminal_lexicon:
        print('loading subliminal lexicon ...')
        subliminalLexicon = loadBlissLexicon(options.subliminal_lexicon)
    else:
        subliminalLexicon = None

    if options.subliminal_g2p:
        print('loading subliminal g2p model ...')
        # Pickle files must be opened in binary mode on Python 3.
        subliminalG2p = pickle.load(open(options.subliminal_g2p, 'rb'))
    else:
        subliminalG2p = None

    if options.g2pModel:
        print('loading g2p model ...')
        model = pickle.load(open(options.g2pModel, 'rb'))
        oldSize, newSize = model.strip()
        print('stripped number of multigrams from %d to %d' % (oldSize, newSize))

        fragmentizer = Fragmentizer(model)
        if subliminalLexicon:
            fragmentizer.addSupervised(subliminalLexicon)
        if subliminalG2p:
            fragmentizer.addSupervised(subliminalG2p)
        graphones = model.sequitur.symbols()
        graphones.remove(model.sequitur.symbol(model.sequitur.term))
    else:
        model = fragmentizer = graphones = None

    # 3. add fragments to lexicon
    if options.write_lexicon:
        print('creating extended lexicon ...')
        xmlLexicon = ElementTree(file=options.lexicon)
        if options.model_type == 'phonemes':
            changeSyntaticToPhonetic(xmlLexicon)
        else:
            addGraphonesToLexicon(xmlLexicon, graphones)
        xmlLexicon.write(gOpenOut(options.write_lexicon), defaultEncoding)

    # 4. determine set of LM tokens
    vocabulary = mGramCounts.ClosedVocablary()
    vocabulary.add(['<s>', '</s>'])
    if options.model_type == 'flat-hybrid':
        vocabulary.add(filter(isLmToken, knownWords), soft=True)
    if graphones:
        vocabulary.add(starmap(lmToken, graphones))
    vocabulary.sort()
    if options.write_tokens:
        f = gOpenOut(options.write_tokens, defaultEncoding)
        if options.model_type == 'phonemes':
            phonemes = set(p for orth, phon in lexicon for p in phon)
            phonemes.add('#1')
            if 'si' in phonemes:
                phonemes.remove('si')
            for p in sorted(phonemes):
                print(p, file=f)
        else:
            for w in vocabulary:
                if w is not None:
                    print(w, file=f)

    # 5./6. set up LM event generator
    if options.write_counts or options.write_events:
        order = options.order - 1
        if options.model_type == 'flat-hybrid':
            events = HybridEventGenerator(knownWords, fragmentizer, order)
            if options.range_type == 'fragments':
                events.setFragmentRange()
            elif options.range_type == 'words':
                events.setTrueWordRange()
            else:
                raise ValueError(options.range_type)
        elif options.model_type == 'fragments':
            events = OovEventGenerator(knownWords, fragmentizer, order)
        elif options.model_type == 'phonemes':
            events = PhonemeEventGenerator(lexicon, order)

    # 5. create modified LM training corpus counts
    if options.write_events:
        print('creating sequence model events ...')
        f = gOpenOut(options.write_events, defaultEncoding)
        for event, count in events(gOpenIn(options.text, defaultEncoding)):
            print(repr(event), '\t', count, file=f)

    # 6. count LM events
    if options.write_counts:
        print('creating sequence model counts ...')
        counts = mGramCounts.SimpleMultifileStorage()
        counts.addIter(events(gOpenIn(options.text, defaultEncoding)))
        mGramCounts.TextStorage.write(
            gOpenOut(options.write_counts, defaultEncoding), counts)

    # 7. dump list of OOV words and their corresponding fragmentation
    if options.write_fragments:
        print('dumping fragments ...')
        f = gOpenOut(options.write_fragments, defaultEncoding)
        events = OovFragmentGenerator(knownWords, fragmentizer)
        fragments = events(gOpenIn(options.text, defaultEncoding))
        for event in list(fragments.keys()):
            print(event, '\t', ' '.join(fragments[event]), file=f)

    # 8. dump modified LM training text
    if options.write_lm_text:
        print('dumping modified LM training text ...')
        f = gOpenOut(options.write_lm_text, defaultEncoding)
        events = OovFragmentGenerator(knownWords, fragmentizer)
        for line in gOpenIn(options.text, defaultEncoding):
            words = line.split()
            modWords = events.modifyLmText(words)
            print(" ".join(modWords), file=f)
Example #8
def set(self, other):
    # Serialize `other` to this object's file, applying the output conversion.
    f = gOpenOut(self.fname)
    self.write(f, other, self.outputConversion)
    f.close()
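set replaces the persistent contents at self.fname with other, applying self.outputConversion on the way out; as everywhere else on this page, gOpenOut decides the concrete output channel from the file name.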