Example #1
def readApply(fname):
    for line in gOpenIn(fname, defaultEncoding):
        word = line.strip()
        #print word
        left = tuple(word)
        #print line
        yield word, left
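
A minimal, self-contained sketch of readApply's per-line contract: each stripped line is yielded as the word together with the tuple of its characters, which is the input form the applier consumes (the sample word is illustrative).

# Sketch of one readApply item (assumed sample word):
word = 'cab'
left = tuple(word)
assert (word, left) == ('cab', ('c', 'a', 'b'))
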
Example #2
    def iter(self, sorted=True, consolidated=True):
        for line in gOpenIn(self.fname):
            fields = line.split()
            # materialize the map so the m-gram can be reversed in place
            mGram = list(map(self.inputConversion, fields[:-1]))
            mGram.reverse()
            item = (tuple(mGram[1:]), mGram[0])
            value = self.value(fields[-1])
            yield item, value
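
The count files read by iter hold one m-gram per line followed by its count; after the reverse, the last word of the m-gram becomes the predicted event and the preceding words form the reversed history. A self-contained sketch of that parsing, with str and int standing in for the library's inputConversion and value hooks (both substitutions are assumptions).

# Sketch: parse one count line, e.g. "a b c 42" (illustrative).
fields = 'a b c 42'.split()
mGram = list(map(str, fields[:-1]))  # str stands in for self.inputConversion
mGram.reverse()                      # ['c', 'b', 'a']: predicted word first
item = (tuple(mGram[1:]), mGram[0])  # (reversed history, predicted word)
value = int(fields[-1])              # int stands in for self.value
assert item == (('b', 'a'), 'c') and value == 42
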
Example #3
def loadPlainSample(fname, encoding=None):
    sample = []
    for line in gOpenIn(fname, encoding or defaultEncoding):
        fields = line.split()
        if not fields: continue
        left = tuple(fields[0])
        right = tuple(fields[1:])
        sample.append((left, right))
    return sample
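
A self-contained sketch of the plain-sample line format: the first whitespace-separated field is the orthography, split into characters, and the remaining fields are its phoneme symbols (the sample line is illustrative).

# Sketch: parse one plain-sample line the way loadPlainSample does.
fields = 'cab k ae b'.split()
left = tuple(fields[0])    # ('c', 'a', 'b'): orthography as characters
right = tuple(fields[1:])  # ('k', 'ae', 'b'): phoneme symbols
assert (left, right) == (('c', 'a', 'b'), ('k', 'ae', 'b'))
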
Example #4
def loadBlissLexicon(fname):
    from elementtree.ElementTree import ElementTree
    xml = ElementTree(file=gOpenIn(fname))
    pronunciations = pronunciationsFromXmlLexicon(xml)
    result = [(orth, phon) for orth in pronunciations
              if not (orth.startswith('[') and orth.endswith(']'))
              for phon in pronunciations[orth]]
    result.sort()
    return result
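
The comprehension above drops entries whose orthography is a bracketed special token such as '[SILENCE]'. A minimal sketch of that filter; the dict below only assumes that pronunciationsFromXmlLexicon maps each orthography to a list of pronunciations.

# Sketch: bracketed tokens are excluded from the (orth, phon) pairs.
pronunciations = {'cab': [('k', 'ae', 'b')], '[SILENCE]': [('si',)]}
result = [(orth, phon) for orth in pronunciations
          if not (orth.startswith('[') and orth.endswith(']'))
          for phon in pronunciations[orth]]
assert result == [('cab', ('k', 'ae', 'b'))]
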
Example #5
    def templateTestRawCounts(self, StorageClass):
        text = misc.gOpenIn('tests/nab-mini-corpus.txt.gz')
        sentences = itertools.imap(str.split, text)
        grams = mGramsChainCount(sentences, self.order)
        counts = StorageClass()
        counts.addIter(grams)

        f = EqualFile('tests/nab-mini-corpus.raw-counts.gz')
        TextStorage.write(f, counts)
        self.failUnless(f)
Example #6
    def templateTestRawCounts(self, StorageClass):
        text = misc.gOpenIn('tests/nab-mini-corpus.txt.gz')
        sentences = map(str.split, text)
        grams = mGramsChainCount(sentences, self.order)
        counts = StorageClass()
        counts.addIter(grams)

        f = EqualFile('tests/nab-mini-corpus.raw-counts.gz')
        TextStorage.write(f, counts)
        self.assertTrue(f)
Example #7
def loadG2PSample(fname):
    if fname == '-':
        sample = loadPlainSample(fname)
    else:
        firstLine = gOpenIn(fname, defaultEncoding).readline()
        if firstLine.startswith('<?xml'):
            sample = [(tuple(orth), tuple(phon))
                      for orth, phon in loadBlissLexicon(fname)]
        else:
            sample = loadPlainSample(fname)
    return sample
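
loadG2PSample sniffs the file format from its first line: a Bliss lexicon is an XML document, so its first line starts with an XML declaration; anything else is treated as a plain sample. A trivial illustration of the sniff:

# Sketch: the XML declaration identifies a Bliss lexicon.
assert '<?xml version="1.0" encoding="UTF-8"?>'.startswith('<?xml')
assert not 'cab k ae b'.startswith('<?xml')
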
Example #8
    def templateTestMappedCounts(self, StorageClass):
        vocabulary = loadVocabulary('tests/nab-5k-vocabulary.txt.gz')
        text = misc.gOpenIn('tests/nab-mini-corpus.txt.gz')
        sentences = itertools.imap(str.split, text)
        sentences = itertools.imap(lambda s: map(vocabulary.map, s), sentences)
        grams = mGramsChainCount(sentences, self.order)
        counts = StorageClass()
        counts.addIter(grams)

        f = EqualFile('tests/nab-mini-corpus.mapped-counts.gz')
        TextStorage.write(f, counts)
        self.failUnless(f)
Example #9
    def templateTestMappedCounts(self, StorageClass):
        vocabulary = loadVocabulary('tests/nab-5k-vocabulary.txt.gz')
        text = misc.gOpenIn('tests/nab-mini-corpus.txt.gz')
        sentences = map(str.split, text)
        sentences = map(lambda s: list(map(vocabulary.map, s)), sentences)
        grams = mGramsChainCount(sentences, self.order)
        counts = StorageClass()
        counts.addIter(grams)

        f = EqualFile('tests/nab-mini-corpus.mapped-counts.gz')
        TextStorage.write(f, counts)
        self.assertTrue(f)
Example #10
def readApplyP2G(fname, encoding=None):
    for line in gOpenIn(fname, encoding):
        line = line.rstrip()
        fields = line.split("\t")
        if len(fields) == 1:
            word = fields[0]
            left = tuple(fields[0].split())
        elif len(fields) == 2:
            word = fields[0]
            left = tuple(fields[1].split())
        else:
            print('unknown format in file: %s' % line, file=stderr)
            continue  # skip malformed lines instead of yielding stale values
        yield word, left
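
A sketch of the two tab-separated line formats readApplyP2G accepts: a single column whose space-separated symbols serve as both word and input, or a word followed by one tab and its symbol sequence (both sample lines are illustrative).

# Sketch: the two accepted input formats.
one_col = 'k ae b'                # one field: sequence is word and input
fields = one_col.split('\t')
assert tuple(fields[0].split()) == ('k', 'ae', 'b')
two_col = 'cab\tk ae b'           # two fields: word TAB input sequence
word, seq = two_col.split('\t')
assert (word, tuple(seq.split())) == ('cab', ('k', 'ae', 'b'))
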
Example #11
def main(options, args):
    if options.vocabulary:
        vocabulary = loadVocabulary(options.vocabulary)
    else:
        vocabulary = OpenVocabulary()

    if options.text:
        text = misc.gOpenIn(options.text)
        sentences = map(str.split, text)
        sentences = map(lambda s: list(map(vocabulary.map, s)), sentences)
        grams = mGramsChainCount(sentences, options.order - 1)
        counts = createStorage(options)
        counts.addIter(grams)
    elif options.read:
        if len(options.read) > 1:
            counts = createStorage(options)
            counts.addIter(
                consolidate(
                    mergeSort([TextStorage(fname) for fname in options.read])))
        else:
            counts = TextStorage(options.read[0])
    else:
        print("no counts", file=sys.stderr)
        return

    if options.map_oov:
        if not options.vocabulary:
            print("you need to specify a vocabulary", file=sys.stderr)
            return  # cannot map OOVs without a closed vocabulary
        filt = MapUnknownsFilter(counts, vocabulary.list,
                                 vocabulary.unknownSymbol)
        mappedCounts = createStorage(options)
        mappedCounts.addIter(filt.rawIter())
        counts = mappedCounts

    if options.write:
        countFile = misc.gOpenOut(options.write)
        TextStorage.write(countFile, counts)

    if options.counts_of_counts:
        coc = [
            countsOfCounts(mGramReduceToOrder(counts, order))
            for order in range(options.order)
        ]
        import pprint

        pprint.pprint(coc, misc.gOpenOut(options.counts_of_counts))
Example #12
def main(options, args):
    builder = LanguageModelBuilder()
    builder.setLogFile(sys.stdout)

    vocabulary = loadVocabulary(options.vocabulary)
    builder.setVocabulary(vocabulary)

    builder.setHighestOrder(options.order - 1)

    if options.count_cutoffs:
        cutoffs = list(map(int, options.count_cutoffs.split()))
        builder.setCountCutoffs(cutoffs)

    binaryCountFile = options.read + '.bin'
    if os.path.isfile(binaryCountFile):
        counts = StoredCounts(binaryCountFile)
    else:
        counts = loadCounts(options.read, vocabulary, binaryCountFile)

    if options.counts_of_counts:
        coc = eval(gOpenIn(options.counts_of_counts).read())
    else:
        coc = [
            mGramCounts.countsOfCounts(
                mGramCounts.mGramReduceToOrder(counts, order))
            for order in range(options.order)
        ]

    maximumOrder = maximumCountsOrder(coc)
    if builder.highestOrder > maximumOrder:
        print('warning: no counts for orders above %d' % (maximumOrder + 1))
        builder.setHighestOrder(maximumOrder)

    builder.estimateDiscounts(coc)

    if options.lm:
        lm = makeLmWriter(options)
    else:
        lm = LmDummy()

    builder.build(counts, lm)

    if __debug__ and False:  ### TESTING
        print('verifying normalization ...', file=sys.stdout)
        lm2 = Lm(lm)
        lm2.checkNormalisation()
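
estimateDiscounts consumes counts-of-counts: per order, a table of how many distinct m-grams occurred exactly r times, the usual ingredient of Good-Turing style discount estimation. A minimal sketch of such a table, with collections.Counter standing in for the library's countsOfCounts (an assumption).

# Sketch: counts of counts over (mGram, count) pairs.
from collections import Counter
counts = [(('a',), 3), (('b',), 1), (('c',), 1), (('d',), 2)]
coc = Counter(count for _, count in counts)
assert coc == {1: 2, 2: 1, 3: 1}  # two singletons, one doubleton, one tripleton
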
Example #13
def main(options, args):
    builder = LanguageModelBuilder()
    builder.setLogFile(sys.stdout)

    vocabulary = loadVocabulary(options.vocabulary)
    builder.setVocabulary(vocabulary)

    builder.setHighestOrder(options.order - 1)

    if options.count_cutoffs:
        cutoffs = map(int, options.count_cutoffs.split())
        builder.setCountCutoffs(cutoffs)

    binaryCountFile = options.read + '.bin'
    if os.path.isfile(binaryCountFile):
        counts = StoredCounts(binaryCountFile)
    else:
        counts = loadCounts(options.read, vocabulary, binaryCountFile)

    if options.counts_of_counts:
        coc = eval(gOpenIn(options.counts_of_counts).read())
    else:
        coc = [ mGramCounts.countsOfCounts(mGramCounts.mGramReduceToOrder(counts, order))
                for order in range(options.order) ]

    maximumOrder = maximumCountsOrder(coc)
    if builder.highestOrder > maximumOrder:
        print 'warning: no counts for orders above %d' % (maximumOrder + 1)
        builder.setHighestOrder(maximumOrder)

    builder.estimateDiscounts(coc)

    if options.lm:
        lm = makeLmWriter(options)
    else:
        lm = LmDummy()

    builder.build(counts, lm)

    if __debug__ and False:  ### TESTING
        print >> sys.stdout, 'verifying normalization ...'
        lm2 = Lm(lm)
        lm2.checkNormalisation()
Example #14
def main(options, args):
    if options.vocabulary:
        vocabulary = loadVocabulary(options.vocabulary)
    else:
        vocabulary = OpenVocabulary()

    if options.text:
        text = misc.gOpenIn(options.text)
        sentences = itertools.imap(str.split, text)
        sentences = itertools.imap(lambda s: map(vocabulary.map, s), sentences)
        grams = mGramsChainCount(sentences, options.order - 1)
        counts = createStorage(options)
        counts.addIter(grams)
    elif options.read:
        if len(options.read) > 1:
            counts = createStorage(options)
            counts.addIter(consolidate(mergeSort(
                [ TextStorage(fname) for fname in options.read ])))
        else:
            counts = TextStorage(options.read[0])
    else:
        print >> sys.stderr, 'no counts'
        return

    if options.map_oov:
        if not options.vocabulary:
            print >> sys.stderr, 'you need to specify a vocabulary'
        filt = MapUnknownsFilter(counts, vocabulary.list, vocabulary.unknownSymbol)
        mappedCounts = createStorage(options)
        mappedCounts.addIter(filt.rawIter())
        counts = mappedCounts

    if options.write:
        countFile = misc.gOpenOut(options.write)
        TextStorage.write(countFile, counts)

    if options.counts_of_counts:
        coc = [ countsOfCounts(mGramReduceToOrder(counts, order))
                for order in range(options.order) ]
        import pprint
        pprint.pprint(coc, misc.gOpenOut(options.counts_of_counts))
Example #15
def readApplyP2P(fname, encoding=None):
    for line in gOpenIn(fname, encoding):
        fields = line.split()
        word = fields[0]
        left = tuple(fields[1:])
        yield word, left
Example #16
def readApply(fname, encoding=None):
    for line in gOpenIn(fname, encoding):
        word = line.strip()
        left = tuple(word)
        yield word, left
Example #17
def readApply(fname):
    for line in gOpenIn(fname, defaultEncoding):
        word = line.strip()
        left = tuple(word)
        yield word, left
Example #18
def loadVocabulary(fname):
    vocabulary = ClosedVocablary()
    vocabulary.add(['<s>', '</s>'])
    vocabulary.add([line.strip() for line in gOpenIn(fname)], soft=True)
    vocabulary.sort()
    return vocabulary
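
A sketch of the vocabulary file format loadVocabulary expects: one token per line, with the sentence-boundary markers '<s>' and '</s>' always added first (the in-memory lines stand in for gOpenIn, and the plain list for ClosedVocablary).

# Sketch: build the token list the way loadVocabulary seeds it.
lines = ['the\n', 'of\n', 'and\n']  # stands in for gOpenIn(fname)
tokens = ['<s>', '</s>'] + [line.strip() for line in lines]
assert tokens == ['<s>', '</s>', 'the', 'of', 'and']
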
Example #19
def readApplyP2P(fname):
    for line in gOpenIn(fname, defaultEncoding):
        fields = line.split()
        word = fields[0]
        left = tuple(fields[1:])
        yield word, left
Example #20
def main(options, args):
    # 1. load reference lexicon
    print('loading reference lexicon ...')
    lexicon = loadBlissLexicon(options.lexicon)
    knownWords = set([ orth for orth, phon in lexicon ])

    # 2. load model for fragmentizing unknown words
    if options.subliminal_lexicon:
        print('loading subliminal lexicon ...')
        subliminalLexicon = loadBlissLexicon(options.subliminal_lexicon)
    else:
        subliminalLexicon = None

    if options.subliminal_g2p:
        print('loading subliminal g2p model ...')
        subliminalG2p = pickle.load(open(options.subliminal_g2p))
    else:
        subliminalG2p = None

    if options.g2pModel:
        print('loading g2p model ...')
        model = pickle.load(open(options.g2pModel))
        oldSize, newSize = model.strip()
        print('stripped number of multigrams from %d to %d' % (oldSize, newSize))

        fragmentizer = Fragmentizer(model)
        if subliminalLexicon:
            fragmentizer.addSupervised(subliminalLexicon)
        if subliminalG2p:
            fragmentizer.addSupervised(subliminalG2p)
        graphones = model.sequitur.symbols()
        graphones.remove(model.sequitur.symbol(model.sequitur.term))
    else:
        model = fragmentizer = graphones = None

    # 3. add fragments to lexicon
    if options.write_lexicon:
        print('creating extended lexicon ...')
        xmlLexicon = ElementTree(file=options.lexicon)
        if options.model_type == 'phonemes':
            changeSyntaticToPhonetic(xmlLexicon)
        else:
            addGraphonesToLexicon(xmlLexicon, graphones)
        xmlLexicon.write(gOpenOut(options.write_lexicon), defaultEncoding)

    # 4. determine set of LM tokens
    vocabulary = mGramCounts.ClosedVocablary()
    vocabulary.add(['<s>', '</s>'])
    if options.model_type == 'flat-hybrid':
        vocabulary.add(filter(isLmToken, knownWords), soft=True)
    if graphones:
        vocabulary.add(starmap(lmToken, graphones))
    vocabulary.sort()
    if options.write_tokens:
        f = gOpenOut(options.write_tokens, defaultEncoding)
        if options.model_type == 'phonemes':
            phonemes = set(p for orth, phon in lexicon for p in phon)
            phonemes.add('#1')
            if 'si' in phonemes: phonemes.remove('si')
            for p in sorted(phonemes):
                print(p, file=f)
        else:
            for w in vocabulary:
                if w is not None:
                    print(w, file=f)

    # 5./6. set up LM event generator
    if options.write_counts or options.write_events:
        order = options.order - 1
        if options.model_type == 'flat-hybrid':
            events = HybridEventGenerator(knownWords, fragmentizer, order)
            if options.range_type == 'fragments':
                events.setFragmentRange()
            elif options.range_type == 'words':
                events.setTrueWordRange()
            else:
                raise ValueError(options.range_type)
        elif options.model_type == 'fragments':
            events = OovEventGenerator(knownWords, fragmentizer, order)
        elif options.model_type == 'phonemes':
            events = PhonemeEventGenerator(lexicon, order)

    # 5. create modified LM training corpus counts
    if options.write_events:
        print('creating sequence model events ...')
        f = gOpenOut(options.write_events, defaultEncoding)
        for event, count in events(gOpenIn(options.text, defaultEncoding)):
            print(repr(event), '\t', count, file=f)

    # 6. count LM events
    if options.write_counts:
        print('creating sequence model counts ...')
        counts = mGramCounts.SimpleMultifileStorage()
        counts.addIter(events(gOpenIn(options.text, defaultEncoding)))
        mGramCounts.TextStorage.write(gOpenOut(options.write_counts, defaultEncoding), counts)

    # 7. dump list of OOV words and their corresponding fragmentation
    if options.write_fragments:
        print('dumping fragments ...')
        f = gOpenOut(options.write_fragments, defaultEncoding)
        events = OovFragmentGenerator(knownWords, fragmentizer)
        fragments = events(gOpenIn(options.text, defaultEncoding))
        for event in list(fragments.keys()):
            print(event, '\t', ' '.join(fragments[event]), file=f)

    # 8. dump modified LM training text
    if options.write_lm_text:
        print('dumping modified LM training text ...')
        f = gOpenOut(options.write_lm_text, defaultEncoding)
        events = OovFragmentGenerator(knownWords, fragmentizer)
        for line in gOpenIn(options.text, defaultEncoding):
            words = line.split()
            modWords = events.modifyLmText(words)
            print(" ".join(modWords), file=f)
Example #21
def main(options, args):
    # 1. load reference lexicon
    print 'loading reference lexicon ...'
    lexicon = loadBlissLexicon(options.lexicon)
    knownWords = set([ orth for orth, phon in lexicon ])

    # 2. load model for fragmentizing unknown words
    if options.subliminal_lexicon:
        print 'loading subliminal lexicon ...'
        subliminalLexicon = loadBlissLexicon(options.subliminal_lexicon)
    else:
        subliminalLexicon = None

    if options.subliminal_g2p:
        print 'loading subliminal g2p model ...'
        subliminalG2p = pickle.load(open(options.subliminal_g2p))
    else:
        subliminalG2p = None

    if options.g2pModel:
        print 'loading g2p model ...'
        model = pickle.load(open(options.g2pModel))
        oldSize, newSize = model.strip()
        print 'stripped number of multigrams from %d to %d' % (oldSize, newSize)

        fragmentizer = Fragmentizer(model)
        if subliminalLexicon:
            fragmentizer.addSupervised(subliminalLexicon)
        if subliminalG2p:
            fragmentizer.addSupervised(subliminalG2p)
        graphones = model.sequitur.symbols()
        graphones.remove(model.sequitur.symbol(model.sequitur.term))
    else:
        model = fragmentizer = graphones = None

    # 3. add fragments to lexicon
    if options.write_lexicon:
        print 'creating extended lexicon ...'
        xmlLexicon = ElementTree(file=options.lexicon)
        if options.model_type == 'phonemes':
            changeSyntaticToPhonetic(xmlLexicon)
        else:
            addGraphonesToLexicon(xmlLexicon, graphones)
        xmlLexicon.write(gOpenOut(options.write_lexicon), defaultEncoding)

    # 4. determine set of LM tokens
    vocabulary = mGramCounts.ClosedVocablary()
    vocabulary.add(['<s>', '</s>'])
    if options.model_type == 'flat-hybrid':
        vocabulary.add(ifilter(isLmToken, knownWords), soft=True)
    if graphones:
        vocabulary.add(starmap(lmToken, graphones))
    vocabulary.sort()
    if options.write_tokens:
        f = gOpenOut(options.write_tokens, defaultEncoding)
        if options.model_type == 'phonemes':
            phonemes = set(p for orth, phon in lexicon for p in phon)
            phonemes.add('#1')
            if 'si' in phonemes: phonemes.remove('si')
            for p in sorted(phonemes):
                print >> f, p
        else:
            for w in vocabulary:
                if w is not None:
                    print >> f, w

    # 5./6. set up LM event generator
    if options.write_counts or options.write_events:
        order = options.order - 1
        if options.model_type == 'flat-hybrid':
            events = HybridEventGenerator(knownWords, fragmentizer, order)
            if options.range_type == 'fragments':
                events.setFragmentRange()
            elif options.range_type == 'words':
                events.setTrueWordRange()
            else:
                raise ValueError(options.range_type)
        elif options.model_type == 'fragments':
            events = OovEventGenerator(knownWords, fragmentizer, order)
        elif options.model_type == 'phonemes':
            events = PhonemeEventGenerator(lexicon, order)

    # 5. create modified LM training corpus counts
    if options.write_events:
        print 'creating sequence model events ...'
        f = gOpenOut(options.write_events, defaultEncoding)
        for event, count in events(gOpenIn(options.text, defaultEncoding)):
            print >> f, repr(event), '\t', count

    # 6. count LM events
    if options.write_counts:
        print 'creating sequence model counts ...'
        counts = mGramCounts.SimpleMultifileStorage()
        counts.addIter(events(gOpenIn(options.text, defaultEncoding)))
        mGramCounts.TextStorage.write(gOpenOut(options.write_counts, defaultEncoding), counts)

    # 7. dump list of OOV words and their corresponding fragmentation
    if options.write_fragments:
        print 'dumping fragments ...'
        f = gOpenOut(options.write_fragments, defaultEncoding)
        events = OovFragmentGenerator(knownWords, fragmentizer)
        fragments = events(gOpenIn(options.text, defaultEncoding))
        for event in fragments.keys():
            print >> f, event, '\t', ' '.join(fragments[event])

    # 8. dump modified LM training text
    if options.write_lm_text:
        print 'dumping modified LM training text ...'
        f = gOpenOut(options.write_lm_text, defaultEncoding)
        events = OovFragmentGenerator(knownWords, fragmentizer)
        for line in gOpenIn(options.text, defaultEncoding):
            words = line.split()
            modWords = events.modifyLmText(words)
            print >> f, " ".join(modWords)