Exemplo n.º 1
0
        '--export-dir',
        dest='export_dir',
        help=
        'Export dir for nonsilence_phones.txt, silence_phones.txt and extra_questions.txt',
        type=str,
        default='data/local/dict/')

    args = parser.parse_args()

    if (args.singlefile != ''):
        ids = [args.singlefile]
    else:
        if args.filelist == '':
            print('No files specified for processing!')
            sys.exit()
        ids = common_utils.loadIdFile(args.filelist)

    combinedDict = {}

    for myid in ids:
        print("I'm now opening ", myid)
        importer = guessImportFunc(myid)
        d = importer(myid)
        combinedDict = merge_dicts(combinedDict, d)

    variants = 0
    for key in sorted(combinedDict.keys()):
        #print 'Word:',key,combinedDict[key]
        variants += len(combinedDict[key])

    print('Dictionary size is ', len(combinedDict),
    parser.add_argument('-p',
                        '--utterance-postfix-name',
                        dest='postfix',
                        help='--utterance-postfix-name',
                        type=str,
                        default='')

    args = parser.parse_args()

    if args.filelist == '':
        print 'Corpus filelist is empty. Use -f to supply a filelist!'
    else:

        print 'Load ', args.filelist, ', ommit ', args.remove_extension

        ids_raw = common_utils.loadIdFile(
            args.filelist, remove_extension=args.remove_extension)
        print 'I have', len(
            ids_raw
        ), 'files. Some may have their audio missing, I\'ll check that for you...'
        ids = []
        #check for missing wav files:

        omitted = 0
        for myid in ids_raw:
            check = myid + args.postfix + args.wav_extension
            if os.path.isfile(check):
                ids.append(myid)
            elif os.path.isfile(myid + '_Kinect-Beam' + args.wav_extension):
                ids.append(myid)
            else:
                print 'Warning, omitting', myid, 'because I can\'t find', check
Exemplo n.º 3
0
def find_ids_with_audio(ids_raw, postfix, wav_extension):
    """Split corpus ids by whether their audio file exists on disk.

    For each id the path ``id + postfix + wav_extension`` is checked with
    ``os.path.isfile``. A warning is printed for every id whose audio file
    cannot be found.

    Args:
        ids_raw: Iterable of id strings (path prefixes without extension).
        postfix: String appended to the id before the extension.
        wav_extension: Audio file extension, including the leading dot.

    Returns:
        A ``(ids, omitted)`` tuple: the list of ids whose audio file exists
        (in input order) and the number of ids that were dropped.
    """
    ids = []
    omitted = 0
    for myid in ids_raw:
        check = myid + postfix + wav_extension
        if os.path.isfile(check):
            ids.append(myid)
        else:
            print("Warning, omitting %s because I can't find %s" % (myid, check))
            omitted += 1
    return ids, omitted


if __name__ == '__main__':
    # Command line entry point: load the transcription file list and keep
    # only the ids that have a matching audio file.
    #
    # Every print() call takes exactly one pre-formatted string so that the
    # output is the same on Python 2 and Python 3 (the former print
    # statements were a SyntaxError on Python 3). The format strings keep
    # the exact spacing the print statements produced.
    parser = argparse.ArgumentParser(description='Prepares the files from the TUDA corpus (XML) into text transcriptions for KALDI')
    parser.add_argument('-f', '--filelist', dest='filelist', help='process this file list', type=str, default='')
    parser.add_argument('-r', '--remove_extension', dest='remove_extension', help='remove this extension, to get plain file id', type=str, default='.xml')
    parser.add_argument('-w', '--audio-file-extension', dest='wav_extension', help='extension for audio files', type=str, default='.wav')
    parser.add_argument('-p', '--utterance-postfix-name', dest='postfix', help='--utterance-postfix-name', type=str, default='_Kinect-Beam')

    args = parser.parse_args()

    if not args.filelist:
        print('Corpus filelist is empty. Use -f to supply a filelist!')
    else:
        print('Load  %s , ommit  %s' % (args.filelist, args.remove_extension))

        ids_raw = common_utils.loadIdFile(args.filelist, remove_extension=args.remove_extension)
        print("I have %d files. Some may have their audio missing, I'll check that for you..." % len(ids_raw))

        # Check for missing wav files.
        ids, omitted = find_ids_with_audio(ids_raw, args.postfix, args.wav_extension)

        print('Found %d  wav files.' % len(ids))
        print('Omitted  %d  xml transcription files (Some missing files is normal for the TUDA Kaldi corpus).' % omitted)
Exemplo n.º 4
0
    # Script body: merge one or more pronunciation lexicons into a single
    # dictionary and report its size.
    #
    # Every print() call takes exactly one pre-formatted string so that the
    # output is the same on Python 2 and Python 3; the format strings keep
    # the exact spacing the former print statements produced.
    parser = argparse.ArgumentParser(description='Prepares various sources of pronounciations and builds a lexicon that can be exported to KALDI')
    parser.add_argument('-f', '--filelist', dest='filelist', help='Process this file list of lexicons', type=str, default='')
    parser.add_argument('-s', '--single-file', dest='singlefile', help='Process this single lexicon file', type=str, default='')
    parser.add_argument('-e', '--export-pickle', dest='export', help='Export pickle file of combined phoneme dictionary', type=str, default='')
    parser.add_argument('-d', '--export-dir', dest='export_dir', help='Export dir for nonsilence_phones.txt, silence_phones.txt and extra_questions.txt', type=str, default='data/local/dict/')

    args = parser.parse_args()

    # A single lexicon given with -s takes precedence over a file list (-f).
    if args.singlefile != '':
        ids = [args.singlefile]
    elif args.filelist != '':
        ids = common_utils.loadIdFile(args.filelist)
    else:
        print('No files specified for processing!')
        # Exit with a non-zero status so calling scripts can detect the
        # failure (a bare sys.exit() reports success).
        sys.exit(1)

    combinedDict = {}

    for myid in ids:
        print("I'm now opening  %s" % (myid,))
        # guessImportFunc returns the callable used to load this lexicon.
        importer = guessImportFunc(myid)
        d = importer(myid)
        combinedDict = merge_dicts(combinedDict, d)

    # Total number of entries stored across all words. Plain iteration over
    # the dictionary works on Python 2 and 3 (iterkeys() is Python 2 only),
    # and no sorting is needed just to add up lengths.
    variants = sum(len(combinedDict[key]) for key in combinedDict)

    print('Dictionary size is  %d  pronounciation variants  %d' % (len(combinedDict), variants))