def SymbolsToFile(filename, symbols_and_labels, print_info=False):
    """Write a symbol table to a file via WriteOpenFstSymbolTable.

    Args:
        filename: Path of the file to open (through utf8.open) for writing.
        symbols_and_labels: Passed unchanged to WriteOpenFstSymbolTable; its
            len() is reported when print_info is set.
        print_info: When true, a one-line summary naming the file and the
            number of symbols is written to utf8.stderr.
    """
    with utf8.open(filename, mode='w') as out:
        WriteOpenFstSymbolTable(out, symbols_and_labels)
    if not print_info:
        return
    count = len(symbols_and_labels)
    utf8.stderr.write('Wrote %s with %d symbols\n' % (filename, count))
def main(argv):
    """Compare a golden lexicon against a predicted lexicon.

    argv[1] names the golden lexicon file. When exactly three arguments are
    present, argv[2] names the predicted lexicon file; in every other case
    the predicted lexicon is read from utf8.stdin. The comparison report is
    written to _stdout by CompareLexica.

    Exits with status 1 when fewer than two arguments are supplied.
    """
    if len(argv) < 2:
        _stderr.write(
            'Not enough arguments. Use --help for usage information.\n')
        sys.exit(1)

    with utf8.open(argv[1]) as golden_file:
        golden = ReadLexicon(golden_file)

    if len(argv) != 3:
        predicted = ReadLexicon(utf8.stdin)
    else:
        with utf8.open(argv[2]) as predicted_file:
            predicted = ReadLexicon(predicted_file)

    # Uses oracle.
    # Alternatively, CompareLexica(golden, predicted, _stdout, 1, 1)
    # would compute word/phone error based on the top golden and top
    # predicted pronunciation.
    CompareLexica(golden, predicted, _stdout)
def GetSampaToIpaMapping(path):
    """Load a two-column, tab-separated table into a dict.

    Each line of the file at path must hold exactly two tab-separated
    fields; the first becomes the key and the second the value.

    Args:
        path: File to read through utf8.open.

    Returns:
        A dict mapping the first column to the second column.

    Raises:
        AssertionError: If a line does not have exactly two fields, or if a
            first-column value appears more than once.
    """
    sampa_to_ipa = {}
    with utf8.open(path) as reader:
        for row in reader:
            columns = row.rstrip('\n').split('\t')
            assert len(columns) == 2
            key, value = columns
            # Duplicate keys are treated as a malformed table.
            assert key not in sampa_to_ipa
            sampa_to_ipa[key] = value
    return sampa_to_ipa
def main(argv):
    """Merge one or more TSV lexicons and print the result sorted by key.

    Every argument after argv[0] is a lexicon path; the special path '-'
    reads from STDIN. Lexicons are merged with dict.update, so an entry from
    a later argument replaces an entry with the same key from an earlier
    one. The merged lexicon is written to STDOUT as 'key<TAB>value' lines,
    ordered by sorted key.

    Exits with status 2 after printing a usage line when no lexicon is given.
    """
    if len(argv) == 1:
        STDOUT.write('Usage: %s LEXICON...\n' % argv[0])
        sys.exit(2)

    merged = {}
    for source in argv[1:]:
        if source != '-':
            with utf8.open(source) as reader:
                merged.update(ReadTsvLexicon(reader))
        else:
            merged.update(ReadTsvLexicon(STDIN))

    for orth in sorted(merged):
        prons = merged[orth]
        for pron in prons:
            STDOUT.write('%s\t%s\n' % (orth, pron))
def main(argv):
    """Check whether a set of whitespace-separated tokens is uniquely decodable.

    Tokens are collected from the file named by argv[1] when present, and
    from utf8.stdin otherwise. The collected set is handed to
    IsUniquelyDecodable together with utf8.stderr.

    Exits with status 0 when IsUniquelyDecodable returns a true value and
    with status 1 otherwise.
    """
    codes = set()
    if len(argv) > 1:
        with utf8.open(argv[1]) as reader:
            for line in reader:
                codes.update(line.split())
    else:
        for line in utf8.stdin:
            codes.update(line.split())

    if IsUniquelyDecodable(codes, utf8.stderr):
        status = 0
    else:
        status = 1
    sys.exit(status)
def ReadGraphemeDataFromFile(path):
    """Yield one record per data line of a tab-separated file.

    Empty lines and lines starting with '#' are skipped. Every remaining
    line must have at least two tab-separated fields.

    Args:
        path: File to read through utf8.open.

    Yields:
        Tuples (field0, field1, codepoints, label), where codepoints is the
        third field if the line has one and '' otherwise, and label is an
        integer that starts at 0xF000 and increases by one per yielded
        record.

    Raises:
        AssertionError: If a data line has fewer than two fields.
    """
    next_label = 0xF000
    with utf8.open(path, mode='r') as reader:
        for raw in reader:
            raw = raw.strip('\n')
            if raw.startswith('#') or not raw:
                continue
            columns = raw.split('\t')
            assert len(columns) >= 2
            codepoints = columns[2] if len(columns) > 2 else ''
            yield columns[0], columns[1], codepoints, next_label
            next_label += 1
def main(args):
    """Score key/value pairs read from STDIN against a golden dictionary.

    args[1] names a TSV golden dictionary; a key may appear on several lines
    and all of its values are accepted. An optional args[2] is parsed as a
    float accuracy target (in percent). Each pair read from STDIN whose key
    is in the golden dictionary counts toward the total, and counts as
    correct when its value is one of the golden values for that key. Pairs
    with unknown keys are reported on STDOUT and skipped.

    The accuracy line is written to both STDOUT and STDERR.

    Exit status:
        0: No target was given, or accuracy is strictly greater than target.
        1: Wrong number of arguments, or accuracy did not exceed the target.
        2: No input pair had a key present in the golden dictionary.
    """
    if len(args) < 2 or len(args) > 3:
        STDOUT.write('Usage: %s GOLDEN_DICTIONARY\n' % args[0])
        sys.exit(1)

    # A negative target means "no pass/fail threshold requested".
    target = float(args[2]) if len(args) == 3 else -1

    golden = {}
    with utf8.open(args[1]) as reader:
        for key, val in ReadTSV(reader):
            golden.setdefault(key, []).append(val)

    total = 0
    correct = 0
    for key, val in ReadTSV(STDIN):
        if key not in golden:
            # The key must be interpolated; previously the literal '%s' was
            # printed because the format string was never applied.
            STDOUT.write('Skipping %s; not in golden dictionary\n' % (key,))
            continue
        total += 1
        if val in golden[key]:
            correct += 1

    if total == 0:
        STDOUT.write('Total number of words that can be evaluated is zero.\n')
        sys.exit(2)

    accuracy = correct * 100.0 / total
    summary = 'Accuracy: %d / %d = %g %%\n' % (correct, total, accuracy)
    STDOUT.write(summary)
    STDERR.write(summary)

    if target < 0:
        sys.exit(0)
    elif accuracy > target:
        STDERR.write('PASS\n')
        sys.exit(0)
    else:
        STDERR.write('FAIL\n')
        sys.exit(1)
def TestPronunciationRules(xltor, mapping, dictionary):
    """Batch-test a transliterator against a pronunciation dictionary.

    Each line of the dictionary file must have exactly two tab-separated
    fields: an orthography and a whitespace-separated pronunciation. Every
    pronunciation symbol is looked up in mapping and the results are
    concatenated to form the expected string. Orthographies listed in
    EXCEPTIONAL_WORDS are not checked. For the rest, a mismatch between
    xltor.transliterate(orth) and the expected string is reported through
    utf8.Print.

    Args:
        xltor: Object providing transliterate(orth).
        mapping: Lookup table applied to each pronunciation symbol.
        dictionary: Path of the dictionary file, opened with utf8.open.

    Returns:
        True if no mismatch was found, False otherwise.

    Raises:
        AssertionError: If a line does not have exactly two fields.
    """
    all_ok = True
    with utf8.open(dictionary) as reader:
        for entry in reader:
            columns = entry.rstrip('\n').split('\t')
            assert len(columns) == 2
            orth, pron = columns
            phones = pron.split()
            # The lookup happens before the exception check, so symbols
            # missing from the mapping fail even for exceptional words.
            expected = ''.join(mapping[p] for p in phones)
            if orth in EXCEPTIONAL_WORDS:
                continue
            predicted = xltor.transliterate(orth)
            if predicted != expected:
                report = '%s\t%s\t%s != %s' % (
                    orth, ' '.join(phones), expected, predicted)
                utf8.Print(report)
                all_ok = False
    return all_ok