Example #1
#
# Reads a dictionary and a file that contains a list of words that should be
# removed. Then prints the dictionary with those words excluded.

import argparse
import sys
import codecs
from pronunciationdictionary import PronunciationDictionary
from filetypes import TextFileType, BinaryFileType

parser = argparse.ArgumentParser()
parser.add_argument('dictionary',
                    type=BinaryFileType('r'),
                    help='the source dictionary')
parser.add_argument('wordlist',
                    type=TextFileType('r'),
                    help='file containing a list of words to be excluded')
parser.add_argument(
    '-c',
    '--count',
    type=int,
    dest='exclude_count',
    default=None,
    help=
    "number of dictionary entries to remove (the default is all the entries in the word list)"
)
parser.add_argument('-k',
                    '--keep-words',
                    action='store_true',
                    default=False,
                    help="leave at least one pronunciation for every word")
Example #2
#
#   WORD[:N] WORDS1:ORIGERR1:ERR1 WORDS2:ORIGERR2:ERR2 ...
#
# :N indicates Nth pronunciation variant, and is only given for words with
# multiple pronunciation variants. WORDSi is the number of words in the ith
# utterance. ORIGERRi is the number of word errors in the best path of the
# original pronunciation lattice. ERRi is the number of word errors in the
# best path of the pronunciation lattice without WORD:N.

import argparse
import sys
import codecs
from filetypes import TextFileType

parser = argparse.ArgumentParser()
parser.add_argument('errors', type=TextFileType('r'), help='input errors file')
parser.add_argument('--add-one-smoothing',
                    action='store_true',
                    default=False,
                    help='add one to total error increase for each word')
parser.add_argument(
    '--algorithm',
    type=str,
    default="wer_dec",
    help=
    'scoring algorithm (err_dec = total error decrease, wer_dec = average error decrease per utterance)'
)
parser.add_argument('--max-err-inc',
                    type=int,
                    default=9999,
                    help='set a maximum for error increase per utterance')
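
# A hedged sketch (not part of the original script) of parsing one line of
# the errors file format described in the header comment:
#   WORD[:N] WORDS1:ORIGERR1:ERR1 WORDS2:ORIGERR2:ERR2 ...
def parse_errors_line(line):
    fields = line.split()
    word = fields[0]  # may carry a ":N" suffix for the Nth pronunciation variant
    utterances = []
    for field in fields[1:]:
        num_words, orig_errors, errors = (int(x) for x in field.split(':'))
        utterances.append((num_words, orig_errors, errors))
    return word, utterances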
Example #3
import argparse
from filetypes import TextFileType


class Chunk:
	def __init__(self, number, prefix):
		self.file = open(prefix + '.' + str(number), 'w', encoding='utf-8')
		self.bytes = 0

	def __del__(self):
		self.file.close()

	def write(self, text):
		self.file.write(text)
		# Track the chunk size in encoded bytes, not characters.
		self.bytes += len(text.encode('utf-8'))


parser = argparse.ArgumentParser()
parser.add_argument('pages', type=TextFileType('r'), nargs='?', default='-', help='input pages file')
parser.add_argument('-b', '--bytes', type=int, default=536870912, help='target chunk size in bytes')
parser.add_argument('-o', '--offset', type=int, default=None, help='target offset between chunks in bytes')
parser.add_argument('--prefix', type=str, default='x', help='output file prefix')
args = parser.parse_args()

if args.offset is None:
	args.offset = args.bytes

chunk_number = 1
chunks = [Chunk(chunk_number, args.prefix)]
chunk_number += 1

current_offset = 0
for line in args.pages:
	if line.startswith("###### "):
Example #4
		for word in line.split():
			if word not in vocabulary:
				vocabulary.add(word)
				if word in counts:
					num_oov_words -= counts[word]
					del counts[word]
			word_count += 1
			if word_count % 100000 == 0:
				oov_rate = float(num_oov_words) / num_test_words
				sys.stdout.write("%i, %i, %i, %f\n" % (word_count, len(vocabulary), num_oov_words, oov_rate))
				sys.stderr.write("word_count=%i, vocabulary=%i, num_oov_words=%i, oov_rate=%f\n" % (word_count, len(vocabulary), num_oov_words, oov_rate))
			if word_count > word_limit:
				return

parser = argparse.ArgumentParser()
parser.add_argument('text', type=TextFileType('r'), help='input text file')
parser.add_argument('--limit', type=int, default=None)
args = parser.parse_args()

counts = {}
for line in args.text:
	for word in line.split():
		if word not in counts:
			counts[word] = 1
		else:
			counts[word] += 1
args.text.close()

print_stats(counts, args.limit)

sorted_counts = sorted(counts, key=counts.get)
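
# Side note (not part of the original script): the counting loop above is
# equivalent to building a collections.Counter over the words.
from collections import Counter

def count_words(lines):
	return Counter(word for line in lines for word in line.split())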
Example #5
        stdout = score_str + '\n' + stdout

    sys.stderr.write("Return code: %d\n" % return_code)
    if stdout != '':
        sys.stderr.write("Standard output:\n")
        sys.stderr.write("%s\n" % stdout)
    if stderr != '':
        sys.stderr.write("Standard error:\n")
        sys.stderr.write("%s\n" % stderr)
    sys.stderr.write(score_str)


parser = argparse.ArgumentParser()
parser.add_argument('command', type=str, help='a command that scores text')
parser.add_argument('input',
                    type=TextFileType('r'),
                    nargs='+',
                    help='input text page files')
parser.add_argument('--in-memory',
                    action='store_true',
                    default=False,
                    help='load the entire data set into memory')
parser.add_argument(
    '--unit',
    type=str,
    default='each',
    help=
    'send one page at a time ("each"), everything but one page at a time ("exclude"), or all at once ("all")'
)
parser.add_argument('--scores',
                    type=str,
Example #6
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# Reads a list of word pairs, one pair per line, and shows the
# edit operations needed to transform one word into the other.

import argparse
import sys
from filetypes import TextFileType
from editpartitioning import EditPartitioning
from finnishreductions import validate

parser = argparse.ArgumentParser()
parser.add_argument('wordpairs', type=TextFileType('r'), help='file containing word pairs, one per line')
parser.add_argument('--validate-finnish', action='store_true', default=False, help='prints only pairs that are not conversational Finnish reductions')
args = parser.parse_args()

for line in args.wordpairs:
	line = line.strip()
	if len(line) == 0:
		continue
	words = line.split()
	if len(words) != 2:
		sys.stderr.write("Invalid word pair: " + line + "\n")
		continue
	edits = EditPartitioning(words[1], words[0])
	edits.clean()
	if args.validate_finnish:
		if validate(edits.partitions):
			print(edits, "\t\tVALID")
		else:
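
# EditPartitioning comes from the project's own module, which is not shown
# here. As a rough standard-library analogue (an illustration, not the
# author's algorithm), difflib.SequenceMatcher yields the edit operations
# between two words:
import difflib

def edit_operations(source, target):
	matcher = difflib.SequenceMatcher(None, source, target)
	ops = []
	for tag, i1, i2, j1, j2 in matcher.get_opcodes():
		ops.append((tag, source[i1:i2], target[j1:j2]))
	return ops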
Example #7
#
# Author: Seppo Enarvi
# http://users.marjaniemi.com/seppo/

import argparse
import sys
import io
import operator
import subprocess
import tempfile
from pages import *
from perplexity import *
from filetypes import TextFileType

parser = argparse.ArgumentParser()
parser.add_argument('scores', type=TextFileType('r'), help='input scores file')
parser.add_argument('input',
                    type=TextFileType('r'),
                    nargs='+',
                    help='input text page files')
parser.add_argument('--output',
                    type=TextFileType('w'),
                    default='-',
                    help='output file for sorted text pages')
parser.add_argument('--in-memory',
                    action='store_true',
                    default=False,
                    help='load the entire data set into memory')
parser.add_argument(
    '--merge-fragments',
    action='store_true',
Example #8
#
#   log(exp(score1 * scale1) * (1 - lambda) + exp(score2 * scale2) * lambda)
#
# All the log probabilities are base 10.
#
# Author: Seppo Enarvi
# http://users.marjaniemi.com/seppo/

import argparse
import sys
from decimal import *
from filetypes import TextFileType

parser = argparse.ArgumentParser()
parser.add_argument(
    'nbestlist', type=TextFileType('r'),
    help='n-best list in SRILM format with each line prefixed by utterance ID')
parser.add_argument(
    'newscores', type=TextFileType('r'),
    help='a file containing a new LM score for each hypothesis in the n-best list')
parser.add_argument(
    '--scale1', metavar='SCALE', type=float, default=1.0,
    help='scale old LM probabilities by this factor')
parser.add_argument(
    '--scale2', metavar='SCALE', type=float, default=1.0,
    help='scale new LM probabilities by this factor')
parser.add_argument(
    '--lambda', metavar='LAMBDA', dest='lambda2', type=float, default=0.5,
    help='interpolation weight to apply to the new probabilities')
args = parser.parse_args()
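
# A minimal sketch of the interpolation given in the header comment, assuming
# base-10 log probabilities and plain floats (the original imports decimal,
# so its arithmetic may differ); an illustration, not the original code.
import math

def interpolate(score1, score2, scale1, scale2, lambda2):
    prob1 = 10.0 ** (score1 * scale1)
    prob2 = 10.0 ** (score2 * scale2)
    return math.log10(prob1 * (1.0 - lambda2) + prob2 * lambda2)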
Example #9
            if limit is not None:
                result += page.content()
                if result_length > limit:
                    break

    if oov_words > 0:
        sys.stderr.write('select_text: %i words not in vocabulary.\n' %
                         oov_words)

    sys.stderr.write('%i words selected, divergence: %f\n' %
                     (result_length, div))
    return result


parser = argparse.ArgumentParser()
parser.add_argument('pages', type=TextFileType('r'), help='input text pages')
parser.add_argument('ndssample',
                    type=TextFileType('r'),
                    help='a random sample from the non-domain-specific data')
parser.add_argument('idmodel',
                    type=TextFileType('r'),
                    help='unigram in-domain language model')
parser.add_argument(
    '--alpha',
    type=float,
    default=1,
    help='the skew divergence parameter denoting the smoothing influence')
parser.add_argument(
    '--max-div-inc',
    type=float,
    default=0,
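
# The argument list above is cut off. For reference, the skew divergence that
# --alpha controls is commonly defined as KL(p || alpha*q + (1 - alpha)*p).
# A minimal sketch over unigram word -> probability dicts; this is an
# illustration, not the script's own implementation.
import math

def skew_divergence(p, q, alpha):
    divergence = 0.0
    for word, p_prob in p.items():
        smoothed = alpha * q.get(word, 0.0) + (1.0 - alpha) * p_prob
        if p_prob > 0.0 and smoothed > 0.0:
            divergence += p_prob * math.log(p_prob / smoothed)
    return divergence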
Example #10
        if "warning" in line:
            continue
        # The end node was unreachable.
        return ""

    try:
        hypothesis_pos = output.index(' ') + 1
        return output[hypothesis_pos:]
    except ValueError:
        sys.stderr.write(
            'Warning: no space in lattice-tool output ref "%s".\n' % output)
        return ''


parser = argparse.ArgumentParser()
parser.add_argument('lattice', type=TextFileType('r'), help='a lattice file')
parser.add_argument('--exclude',
                    dest='exclude_always',
                    metavar='word',
                    type=str,
                    nargs='*',
                    default=[],
                    help='words to exclude from every decoding')
parser.add_argument(
    '--exclude-individually',
    dest='exclude_once',
    metavar='word',
    type=str,
    nargs='*',
    default=[],
    help=
Example #11
parser.add_argument('--scores', type=str, default='-', help='output scores file')
parser.add_argument('--merge-fragments', action='store_true', default=False, help='merge pages whose URI only differs after fragment identifier')
parser.add_argument('-B', '--batch', type=int, dest='num_batches', default=1, help='number of batches to split the job into')
parser.add_argument('-I', '--bindex', type=int, dest='batch_index', default=1, help='index of this batch, starting from 1')
args = parser.parse_args()

if args.batch_index > args.num_batches:
	sys.stderr.write("Batch index has to be smaller than or equal to the number of batches.\n")
	sys.exit(2)
if args.batch_index < 1:
	sys.stderr.write("Batch indices start from 1.\n")
	sys.exit(2)

existing_uris = set()
if args.scores == '-':
	scores_file = TextFileType('w')(args.scores)
else:
	try:
		scores_file = TextFileType('r')(args.scores)
		for line in scores_file:
			if line.startswith('###### '):
				existing_uris.add(line[7:].strip())
		scores_file.close()
		scores_file = TextFileType('a')(args.scores)
	except argparse.ArgumentTypeError:
		scores_file = TextFileType('w')(args.scores)

# Send pages according to --unit.
if args.unit == "all":
	process = subprocess.Popen([args.command], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
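	# Hypothetical continuation (the original script is cut off here): the
	# selected pages would be piped to the command's stdin and its score read
	# back from stdout, e.g. via process.communicate(). "all_text" below is a
	# placeholder name, not from the original.
	stdout, stderr = process.communicate(input=all_text.encode('utf-8'))
	scores_file.write(stdout.decode('utf-8'))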
Example #12
            self.alternatives.sort(key=lambda x: x.logprob(), reverse=True)
            self.alternatives = self.alternatives[:max_alternatives]

    def size(self):
        return len(self.alternatives)

    def best(self):
        if len(self.alternatives) > 0:
            return max(self.alternatives, key=lambda x: x.logprob())
        else:
            return None


parser = argparse.ArgumentParser()
parser.add_argument('lm',
                    type=TextFileType('r'),
                    help='arpa language model file')
parser.add_argument('trn', type=TextFileType('r'), help='transcript file')
parser.add_argument(
    '--max-alternatives',
    type=int,
    default=None,
    help='maximum number of best alternatives to keep in memory at a time')
args = parser.parse_args()

lm = ArpaLM(args.lm)
args.lm.close()

trn = Transcripts()
trn.read_trn(args.trn)
args.trn.close()