#
# Reads a dictionary and a file that contains a list of words that should be
# removed. Then prints the dictionary with those words excluded.

import argparse
import sys
import codecs

from pronunciationdictionary import PronunciationDictionary
from filetypes import TextFileType, BinaryFileType

parser = argparse.ArgumentParser()
parser.add_argument('dictionary', type=BinaryFileType('r'),
                    help='the source dictionary')
parser.add_argument('wordlist', type=TextFileType('r'),
                    help='file containing a list of words to be excluded')
parser.add_argument(
    '-c', '--count', type=int, dest='exclude_count', default=None,
    help='number of dictionary entries to remove '
         '(the default is all the entries in the word list)')
parser.add_argument('-k', '--keep-words', action='store_true', default=False,
                    help='leave at least one pronunciation for every word')
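
# The exclusion step itself is simple filtering. Below is a minimal,
# self-contained sketch of the logic the options above describe, assuming a
# plain dict that maps each word to a list of pronunciation variants; the
# actual PronunciationDictionary API is not shown in this excerpt.

def exclude_words(pronunciations, excluded, keep_words=False):
    result = {}
    for word, variants in pronunciations.items():
        if word not in excluded:
            result[word] = variants
        elif keep_words:
            # --keep-words: leave at least one pronunciation for every word.
            result[word] = variants[:1]
    return result

# Hypothetical example data: "koira" has two pronunciation variants.
pronunciations = {'kissa': ['k i s s a'], 'koira': ['k o i r a', 'k o i r']}
print(exclude_words(pronunciations, {'koira'}, keep_words=True))
# {'kissa': ['k i s s a'], 'koira': ['k o i r a']}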
#
# WORD[:N] WORDS1:ORIGERR1:ERR1 WORDS2:ORIGERR2:ERR2 ...
#
# :N indicates the Nth pronunciation variant, and is only given for words with
# multiple pronunciation variants. WORDSi is the number of words in the ith
# utterance. ORIGERRi is the number of word errors in the best path of the
# original pronunciation lattice. ERRi is the number of word errors in the
# best path of the pronunciation lattice without WORD:N.

import argparse
import sys
import codecs

from filetypes import TextFileType

parser = argparse.ArgumentParser()
parser.add_argument('errors', type=TextFileType('r'),
                    help='input errors file')
parser.add_argument('--add-one-smoothing', action='store_true', default=False,
                    help='add one to total error increase for each word')
parser.add_argument(
    '--algorithm', type=str, default="wer_dec",
    help='scoring algorithm (err_dec = total error decrease, '
         'wer_dec = average error decrease per utterance)')
parser.add_argument('--max-err-inc', type=int, default=9999,
                    help='set a maximum for error increase per utterance')
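
# A minimal sketch of parsing one line of the errors file format described
# above. The function and variable names here are illustrative, not taken
# from the script itself.

def parse_errors_line(line):
    fields = line.split()
    word, _, variant = fields[0].partition(':')
    variant = int(variant) if variant else 1
    utterances = []
    for field in fields[1:]:
        num_words, orig_errors, errors = (int(x) for x in field.split(':'))
        utterances.append((num_words, orig_errors, errors))
    return word, variant, utterances

# "iltaa:2" is the second pronunciation variant of "iltaa"; removing it
# changes the errors of a 7-word utterance from 1 to 2.
print(parse_errors_line('iltaa:2 7:1:2'))
# ('iltaa', 2, [(7, 1, 2)])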
class Chunk:
    def __init__(self, number, prefix):
        self.file = open(prefix + '.' + str(number), 'w', encoding='utf-8')
        self.bytes = 0

    def __del__(self):
        self.file.close()

    def write(self, text):
        self.file.write(text)
        # Count the encoded size, since the target chunk size is in bytes.
        self.bytes += len(text.encode('utf-8'))

parser = argparse.ArgumentParser()
parser.add_argument('pages', type=TextFileType('r'), nargs='?', default='-',
                    help='input pages file')
parser.add_argument('-b', '--bytes', type=int, default=536870912,
                    help='target chunk size in bytes')
parser.add_argument('-o', '--offset', type=int, default=None,
                    help='target offset between chunks in bytes')
parser.add_argument('--prefix', type=str, default='x',
                    help='output file prefix')
args = parser.parse_args()

if args.offset is None:
    args.offset = args.bytes

chunk_number = 1
chunks = [Chunk(chunk_number, args.prefix)]
chunk_number += 1
current_offset = 0

for line in args.pages:
    if line.startswith("###### "):
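
# The loop above is cut off at the page-boundary check. As an illustration of
# what the -b/-o options imply, here is a self-contained sketch of
# sliding-window chunking: a new chunk starts every `offset` bytes and each
# chunk closes once it holds `chunk_bytes` bytes, so chunks overlap whenever
# offset < chunk_bytes. This is an assumption about the scheme, not the
# original script's loop.

def chunk_boundaries(total_bytes, chunk_bytes, offset):
    """Return the (start, end) byte ranges of the chunks that get written."""
    boundaries = []
    start = 0
    while start < total_bytes:
        boundaries.append((start, min(start + chunk_bytes, total_bytes)))
        start += offset
    return boundaries

# Overlapping 100-byte chunks over 200 bytes, a new one every 50 bytes.
print(chunk_boundaries(200, 100, 50))
# [(0, 100), (50, 150), (100, 200), (150, 200)]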
# Fragment from inside a function that adds words to the vocabulary in order
# and reports how the out-of-vocabulary (OOV) rate develops; the enclosing
# definition and the initialization of vocabulary, num_oov_words,
# num_test_words, word_count, and word_limit are not shown in this excerpt.
        for word in line.split():
            if word not in vocabulary:
                vocabulary.add(word)
                if word in counts:
                    num_oov_words -= counts[word]
                    del counts[word]
            word_count += 1
            if word_count % 100000 == 0:
                oov_rate = float(num_oov_words) / num_test_words
                sys.stdout.write("%i, %i, %i, %f\n" % (
                    word_count, len(vocabulary), num_oov_words, oov_rate))
                sys.stderr.write(
                    "word_count=%i, vocabulary=%i, num_oov_words=%i, "
                    "oov_rate=%f\n" % (
                        word_count, len(vocabulary), num_oov_words, oov_rate))
            if word_limit is not None and word_count > word_limit:
                return

parser = argparse.ArgumentParser()
parser.add_argument('text', type=TextFileType('r'), help='input text file')
parser.add_argument('--limit', type=int, default=None)
args = parser.parse_args()

counts = {}
for line in args.text:
    for word in line.split():
        if word not in counts:
            counts[word] = 1
        else:
            counts[word] += 1
args.text.close()

print_stats(counts, args.limit)

sorted_counts = sorted(counts, key=counts.get)
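
# A small worked example of the statistic printed above: the OOV rate is the
# fraction of test-set tokens whose word is not yet in the vocabulary.
test_counts = {'on': 3, 'kissa': 2, 'talossa': 1}  # test-set word counts
num_test_tokens = sum(test_counts.values())        # 6 tokens in total
vocab = {'on'}                                     # vocabulary grown so far
num_oov = sum(c for w, c in test_counts.items() if w not in vocab)
print(float(num_oov) / num_test_tokens)            # 3 / 6 = 0.5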
# Fragment from the end of a function that reports a scoring command's
# output; the enclosing definition is not shown in this excerpt.
    stdout = score_str + '\n' + stdout
    sys.stderr.write("Return code: %d\n" % return_code)
    if stdout != '':
        sys.stderr.write("Standard output:\n")
        sys.stderr.write("%s\n" % stdout)
    if stderr != '':
        sys.stderr.write("Standard error:\n")
        sys.stderr.write("%s\n" % stderr)
    sys.stderr.write(score_str)

parser = argparse.ArgumentParser()
parser.add_argument('command', type=str, help='a command that scores text')
parser.add_argument('input', type=TextFileType('r'), nargs='+',
                    help='input text page files')
parser.add_argument('--in-memory', action='store_true', default=False,
                    help='load the entire data set into memory')
parser.add_argument(
    '--unit', type=str, default='each',
    help='send one page at a time ("each"), everything but one page at a '
         'time ("exclude"), or all at once ("all")')
parser.add_argument('--scores', type=str,
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Reads a list of word pairs, one pair per line, and shows the
# edit operations needed to transform one word into the other.

import argparse
import sys

from filetypes import TextFileType
from editpartitioning import EditPartitioning
from finnishreductions import validate

parser = argparse.ArgumentParser()
parser.add_argument('wordpairs', type=TextFileType('r'),
                    help='file containing word pairs, one per line')
parser.add_argument('--validate-finnish', action='store_true', default=False,
                    help='print only pairs that are not conversational '
                         'Finnish reductions')
args = parser.parse_args()

for line in args.wordpairs:
    line = line.strip()
    if len(line) == 0:
        continue
    words = line.split()
    if len(words) != 2:
        sys.stderr.write("Invalid word pair: " + line + "\n")
        continue
    edits = EditPartitioning(words[1], words[0])
    edits.clean()
    if args.validate_finnish:
        if validate(edits.partitions):
            print(edits, "\t\tVALID")
        else:
#
# Author: Seppo Enarvi
# http://users.marjaniemi.com/seppo/

import argparse
import sys
import io
import operator
import subprocess
import tempfile

from pages import *
from perplexity import *
from filetypes import TextFileType

parser = argparse.ArgumentParser()
parser.add_argument('scores', type=TextFileType('r'),
                    help='input scores file')
parser.add_argument('input', type=TextFileType('r'), nargs='+',
                    help='input text page files')
parser.add_argument('--output', type=TextFileType('w'), default='-',
                    help='output file for sorted text pages')
parser.add_argument('--in-memory', action='store_true', default=False,
                    help='load the entire data set into memory')
parser.add_argument(
    '--merge-fragments', action='store_true',
#
# log(exp(score1 * scale1) * (1 - lambda) + exp(score2 * scale2) * lambda)
#
# All the log probabilities are base 10.
#
# Author: Seppo Enarvi
# http://users.marjaniemi.com/seppo/

import argparse
import sys
from decimal import *

from filetypes import TextFileType

parser = argparse.ArgumentParser()
parser.add_argument(
    'nbestlist', type=TextFileType('r'),
    help='n-best list in SRILM format with each line prefixed by utterance ID')
parser.add_argument(
    'newscores', type=TextFileType('r'),
    help='a file containing a new LM score for each hypothesis in the n-best list')
parser.add_argument(
    '--scale1', metavar='SCALE', type=float, default=1.0,
    help='scale old LM probabilities by this factor')
parser.add_argument(
    '--scale2', metavar='SCALE', type=float, default=1.0,
    help='scale new LM probabilities by this factor')
parser.add_argument(
    '--lambda', metavar='LAMBDA', dest='lambda2', type=float, default=0.5,
    help='interpolation weight to apply to the new probabilities')
args = parser.parse_args()
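
# A worked example of the interpolation formula above, done in log base 10 as
# the header comment specifies. The function name is illustrative; Decimal
# arithmetic is used here because the script imports it, presumably to avoid
# underflow when exponentiating large negative log probabilities.
from decimal import Decimal

def interpolate(logprob1, logprob2, scale1=1.0, scale2=1.0, lambda2=0.5):
    prob1 = Decimal(10) ** (Decimal(logprob1) * Decimal(scale1))
    prob2 = Decimal(10) ** (Decimal(logprob2) * Decimal(scale2))
    combined = prob1 * Decimal(1 - lambda2) + prob2 * Decimal(lambda2)
    return float(combined.log10())

# With equal weights the result lies between the two log probabilities,
# pulled toward the larger one.
print(interpolate(-3.0, -5.0))  # approximately -3.30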
# Fragment from the end of select_text(); the beginning of the function and
# the loop over the pages are not shown in this excerpt.
        if limit is not None:
            result += page.content()
            if result_length > limit:
                break

    if oov_words > 0:
        sys.stderr.write('select_text: %i words not in vocabulary.\n'
                         % oov_words)
    sys.stderr.write('%i words selected, divergence: %f\n'
                     % (result_length, div))
    return result

parser = argparse.ArgumentParser()
parser.add_argument('pages', type=TextFileType('r'),
                    help='input text pages')
parser.add_argument('ndssample', type=TextFileType('r'),
                    help='a random sample from the non-domain-specific data')
parser.add_argument('idmodel', type=TextFileType('r'),
                    help='unigram in-domain language model')
parser.add_argument(
    '--alpha', type=float, default=1,
    help='the skew divergence parameter denoting the smoothing influence')
parser.add_argument(
    '--max-div-inc', type=float, default=0,
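
# A sketch of the skew divergence that the --alpha option refers to (Lillian
# Lee's skew divergence): s_alpha(p, q) = KL(p || alpha*q + (1 - alpha)*p).
# An alpha close to 1 approaches plain KL divergence, while smaller values
# smooth q with p. The function below is an illustrative stand-in over
# unigram distributions, not the script's own implementation.
import math

def skew_divergence(p, q, alpha):
    div = 0.0
    for word, p_prob in p.items():
        if p_prob > 0.0:
            mix = alpha * q.get(word, 0.0) + (1.0 - alpha) * p_prob
            div += p_prob * math.log(p_prob / mix)
    return div

p = {'a': 0.5, 'b': 0.5}   # hypothetical in-domain unigram model
q = {'a': 0.9, 'b': 0.1}   # hypothetical selected-text distribution
print(skew_divergence(p, q, 0.99))  # approximately 0.49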
if "warning" in line: continue # The end node was unreachable. return "" try: hypothesis_pos = output.index(' ') + 1 return output[hypothesis_pos:] except ValueError: sys.stderr.write( 'Warning: no space in lattice-tool output ref "%s".\n' % output) return '' parser = argparse.ArgumentParser() parser.add_argument('lattice', type=TextFileType('r'), help='a lattice file') parser.add_argument('--exclude', dest='exclude_always', metavar='word', type=str, nargs='*', default=[], help='words to exclude from every decoding') parser.add_argument( '--exclude-individually', dest='exclude_once', metavar='word', type=str, nargs='*', default=[], help=
parser.add_argument('--scores', type=str, default='-',
                    help='output scores file')
parser.add_argument('--merge-fragments', action='store_true', default=False,
                    help='merge pages whose URI differs only after the '
                         'fragment identifier')
parser.add_argument('-B', '--batch', type=int, dest='num_batches', default=1,
                    help='number of batches to split the job into')
parser.add_argument('-I', '--bindex', type=int, dest='batch_index', default=1,
                    help='index of this batch, starting from 1')
args = parser.parse_args()

if args.batch_index > args.num_batches:
    sys.stderr.write("Batch index has to be smaller than or equal to the "
                     "number of batches.\n")
    sys.exit(2)
if args.batch_index < 1:
    sys.stderr.write("Batch indices start from 1.\n")
    sys.exit(2)

existing_uris = set()
if args.scores == '-':
    scores_file = TextFileType('w')(args.scores)
else:
    try:
        # Collect the URIs that have already been scored, then continue
        # appending to the existing scores file.
        scores_file = TextFileType('r')(args.scores)
        for line in scores_file:
            if line.startswith('###### '):
                existing_uris.add(line[7:].strip())
        scores_file.close()
        scores_file = TextFileType('a')(args.scores)
    except argparse.ArgumentTypeError:
        scores_file = TextFileType('w')(args.scores)

# Send pages according to --unit.
if args.unit == "all":
    process = subprocess.Popen([args.command],
                               stdin=subprocess.PIPE, stdout=subprocess.PIPE)
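
# A minimal sketch of how -B/-I style batch splitting can be done: batch k
# (1-based) of N takes every Nth input file starting from index k - 1. This
# illustrates the option semantics, not necessarily the exact scheme the
# script uses.
def select_batch(input_files, num_batches, batch_index):
    return input_files[batch_index - 1::num_batches]

print(select_batch(['a', 'b', 'c', 'd', 'e'], 2, 1))  # ['a', 'c', 'e']
print(select_batch(['a', 'b', 'c', 'd', 'e'], 2, 2))  # ['b', 'd']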
# Fragment from inside a class that keeps the best-scoring alternatives; the
# class header and the beginning of this method are not shown in this excerpt.
        self.alternatives.sort(key=lambda x: x.logprob(), reverse=True)
        self.alternatives = self.alternatives[:max_alternatives]

    def size(self):
        return len(self.alternatives)

    def best(self):
        if len(self.alternatives) > 0:
            return max(self.alternatives, key=lambda x: x.logprob())
        else:
            return None

parser = argparse.ArgumentParser()
parser.add_argument('lm', type=TextFileType('r'),
                    help='arpa language model file')
parser.add_argument('trn', type=TextFileType('r'),
                    help='transcript file')
parser.add_argument(
    '--max-alternatives', type=int, default=None,
    help='maximum number of best alternatives to keep in memory at a time')
args = parser.parse_args()

lm = ArpaLM(args.lm)
args.lm.close()

trn = Transcripts()
trn.read_trn(args.trn)
args.trn.close()