def SymbolsToFile(filename, symbols_and_labels, print_info=False):
    """Write a symbol table to a file via WriteOpenFstSymbolTable.

    Args:
      filename: Path of the file to create or overwrite (opened with
        utf8.open in 'w' mode).
      symbols_and_labels: Passed unchanged to WriteOpenFstSymbolTable; must
        support len() when print_info is true.
      print_info: If true, report the file name and the number of entries
        on utf8.stderr after writing.
    """
    with utf8.open(filename, mode='w') as out:
        WriteOpenFstSymbolTable(out, symbols_and_labels)
    if not print_info:
        return
    summary = 'Wrote %s with %d symbols\n' % (filename,
                                              len(symbols_and_labels))
    utf8.stderr.write(summary)
# Example #2
0
def main(argv):
    """Compare a predicted lexicon against a golden lexicon.

    Args:
      argv: Command-line arguments. argv[1] is the path of the golden
        lexicon. When exactly three arguments are given, argv[2] is the
        path of the predicted lexicon; in every other case the predicted
        lexicon is read from utf8.stdin.

    Exits with status 1 if no golden lexicon path was supplied. The
    comparison report is produced by CompareLexica on _stdout.
    """
    if len(argv) < 2:
        _stderr.write(
            'Not enough arguments. Use --help for usage information.\n')
        sys.exit(1)

    with utf8.open(argv[1]) as golden_file:
        golden = ReadLexicon(golden_file)

    if len(argv) != 3:
        # No (or an unexpected number of) extra arguments: read predictions
        # from standard input.
        predicted = ReadLexicon(utf8.stdin)
    else:
        with utf8.open(argv[2]) as predicted_file:
            predicted = ReadLexicon(predicted_file)

    # This call uses the oracle. Calling
    # CompareLexica(golden, predicted, _stdout, 1, 1) instead would compute
    # word/phone error based on the top golden and top predicted
    # pronunciation.
    CompareLexica(golden, predicted, _stdout)
def GetSampaToIpaMapping(path):
  """Load a two-column, tab-separated SAMPA-to-IPA table from a file.

  Args:
    path: Path of the file to read (opened with utf8.open).

  Returns:
    A dict mapping the first column of each line to its second column.

  Raises:
    AssertionError: If a line does not have exactly two tab-separated
      fields, or if a first-column value occurs more than once.
  """
  sampa_to_ipa = {}
  with utf8.open(path) as reader:
    for raw_line in reader:
      columns = raw_line.rstrip('\n').split('\t')
      assert len(columns) == 2
      key, value = columns
      assert key not in sampa_to_ipa
      sampa_to_ipa[key] = value
  return sampa_to_ipa
def main(argv):
    """Merge one or more TSV lexica and print the result sorted by key.

    Args:
      argv: Command-line arguments. Every argument after argv[0] is a
        lexicon path, with '-' meaning STDIN.

    Each source is parsed with ReadTsvLexicon and merged with dict.update,
    so an entry from a later source replaces an entry with the same key
    from an earlier one. Exits with status 2 after printing a usage line
    if no lexicon was named.
    """
    if len(argv) == 1:
        STDOUT.write('Usage: %s LEXICON...\n' % argv[0])
        sys.exit(2)

    merged = {}
    for source in argv[1:]:
        if source != '-':
            with utf8.open(source) as reader:
                merged.update(ReadTsvLexicon(reader))
        else:
            merged.update(ReadTsvLexicon(STDIN))

    # One output line per (orthography, pronunciation) pair.
    for orth in sorted(merged):
        pronunciations = merged[orth]
        for pron in pronunciations:
            STDOUT.write('%s\t%s\n' % (orth, pron))
# Example #5
0
def main(argv):
    """Check whether a set of whitespace-separated tokens is uniquely decodable.

    Args:
      argv: Command-line arguments. If argv[1] is present it is the path of
        the input file; otherwise input is read from utf8.stdin.

    Every whitespace-separated token of the input is collected into a set,
    which is passed to IsUniquelyDecodable together with utf8.stderr. The
    process exits with status 0 if that call returns a true value and 1
    otherwise.
    """
    codewords = set()

    def collect(lines):
        # Add every whitespace-separated token of every line to the set.
        for line in lines:
            for token in line.split():
                codewords.add(token)

    if len(argv) > 1:
        with utf8.open(argv[1]) as reader:
            collect(reader)
    else:
        collect(utf8.stdin)

    if IsUniquelyDecodable(codewords, utf8.stderr):
        sys.exit(0)
    sys.exit(1)
def ReadGraphemeDataFromFile(path):
    """Yield records parsed from a tab-separated grapheme data file.

    Blank lines and lines starting with '#' are skipped. Every other line
    must have at least two tab-separated fields.

    Args:
      path: Path of the file to read (opened with utf8.open in 'r' mode).

    Yields:
      4-tuples (fields[0], fields[1], codepoints, label), where codepoints
      is the third field or '' when the line has only two fields, and label
      is an integer that starts at 0xF000 and grows by one per yielded
      record.

    Raises:
      AssertionError: If a data line has fewer than two fields.
    """
    with utf8.open(path, mode='r') as reader:
        next_label = 0xF000
        for raw_line in reader:
            text = raw_line.strip('\n')
            if not text:
                continue
            if text.startswith('#'):
                continue
            columns = text.split('\t')
            assert len(columns) >= 2
            codepoints = columns[2] if len(columns) > 2 else ''
            yield columns[0], columns[1], codepoints, next_label
            next_label += 1
def main(args):
    """Score key/value predictions from STDIN against a golden dictionary.

    Args:
      args: Command-line arguments. args[1] is the path of the golden
        dictionary, read with ReadTSV. The optional args[2] is a target
        accuracy in percent, parsed with float().

    A prediction counts as correct when its value is one of the values the
    golden dictionary lists for the same key. Predictions whose key is
    missing from the golden dictionary are reported and left out of the
    total.

    Exit status:
      0: no target was given (or it is negative), or the accuracy is
         strictly greater than the target.
      1: wrong number of arguments, or the accuracy did not exceed the
         target.
      2: none of the predictions could be evaluated.

    Raises:
      ValueError: If args[2] is present but is not a valid float.
    """
    if len(args) < 2 or len(args) > 3:
        STDOUT.write('Usage: %s GOLDEN_DICTIONARY\n' % args[0])
        sys.exit(1)

    # A negative target means "report the accuracy only, no pass/fail".
    if len(args) == 3:
        target = float(args[2])
    else:
        target = -1

    # Group all golden values by key; a key may have several valid values.
    golden = {}
    with utf8.open(args[1]) as reader:
        for key, val in ReadTSV(reader):
            golden.setdefault(key, []).append(val)

    total = 0
    correct = 0
    for key, val in ReadTSV(STDIN):
        if key not in golden:
            # The key must be interpolated here; without it the message
            # would contain a literal '%s'.
            STDOUT.write('Skipping %s; not in golden dictionary\n' % key)
            continue
        total += 1
        if val in golden[key]:
            correct += 1

    if total == 0:
        STDOUT.write('Total number of words that can be evaluated is zero.\n')
        sys.exit(2)

    accuracy = correct * 100.0 / total
    report = 'Accuracy: %d / %d = %g %%\n' % (correct, total, accuracy)
    STDOUT.write(report)
    STDERR.write(report)

    if target < 0:
        sys.exit(0)
    elif accuracy > target:
        # Strict comparison: an accuracy exactly equal to the target fails.
        STDERR.write('PASS\n')
        sys.exit(0)
    else:
        STDERR.write('FAIL\n')
        sys.exit(1)
def TestPronunciationRules(xltor, mapping, dictionary):
  """Batch-test a transliterator against a pronunciation dictionary.

  Each line of the dictionary must hold an orthography and a
  whitespace-separated pronunciation, separated by a tab. The
  pronunciation symbols are converted through `mapping` and concatenated,
  and the result is compared with xltor.transliterate(orth). Words in
  EXCEPTIONAL_WORDS are not compared, but their pronunciation symbols are
  still looked up in `mapping` first.

  Args:
    xltor: Object providing a transliterate(orth) method.
    mapping: Mapping from each pronunciation symbol to its replacement
      string.
    dictionary: Path of the dictionary file (opened with utf8.open).

  Returns:
    True if every compared word matched; False otherwise. Each mismatch is
    reported through utf8.Print.

  Raises:
    AssertionError: If a line does not have exactly two tab-separated
      fields.
    KeyError: If a pronunciation symbol is missing from a dict `mapping`.
  """
  all_matched = True
  with utf8.open(dictionary) as reader:
    for raw_line in reader:
      columns = raw_line.rstrip('\n').split('\t')
      assert len(columns) == 2
      orth, pron = columns
      phones = pron.split()
      expected = ''.join([mapping[phone] for phone in phones])
      if orth in EXCEPTIONAL_WORDS:
        continue
      actual = xltor.transliterate(orth)
      if actual != expected:
        report = '%s\t%s\t%s != %s' % (
            orth, ' '.join(phones), expected, actual)
        utf8.Print(report)
        all_matched = False
  return all_matched