def process_ted(csv_file, category):

    parent_path = _data_path + 'TEDLIUM_release2/' + category + '/'
    labels, wave_files, offsets, durs = [], [], [], []

    # create csv writer
    writer = csv.writer(csv_file, delimiter=',')

    # read STM file list
    stm_list = glob.glob(parent_path + 'stm/*')
    for stm in stm_list:
        with open(stm, 'rt') as f:
            records = f.readlines()
            for record in records:
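                # a TED-LIUM STM record has the layout:
                #   <talk-id> <channel> <speaker> <start> <end> <label> <transcript ...>
                # so field[3]/field[4] are the start/end times and field[6:] is the transcript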
                field = record.split()

                # wave file name
                wave_file = parent_path + 'sph/%s.sph.wav' % field[0]
                wave_files.append(wave_file)

                # label index
                labels.append(data.str2index(' '.join(field[6:])))

                # start, end info
                start, end = float(field[3]), float(field[4])
                offsets.append(start)
                durs.append(end - start)

    # save results
    for i, (wave_file, label, offset,
            dur) in enumerate(zip(wave_files, labels, offsets, durs)):

        # print info
        print("TEDLIUM corpus preprocessing (%d / %d) - '%s-%.2f]" %
              (i, len(wave_files), wave_file, offset))

        # load wave file
        wave, sr = librosa.load(wave_file,
                                mono=True,
                                sr=None,
                                offset=offset,
                                duration=dur)

        # get mfcc feature
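        # (librosa's defaults give 20 MFCC coefficients per frame)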
        mfcc = librosa.feature.mfcc(wave, sr=16000)

        # save result ( exclude small mfcc data to prevent ctc loss )
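        # (CTC needs at least as many input time steps as output labels,
        #  so clips whose label outruns the MFCC frame count are dropped)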
        if len(label) < mfcc.shape[1]:
            # filename
            fn = "%s-%.2f" % (wave_file.split('/')[-1], offset)

            # save meta info
            writer.writerow([fn] + label)

            # save mfcc
            np.save('asset/data/preprocess/mfcc/' + fn + '.npy',
                    mfcc,
                    allow_pickle=False)
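
A hypothetical invocation, assuming _data_path points at the corpus root and the output directories already exist (the meta CSV path here is illustrative, not from the original source):

import csv

with open('asset/data/preprocess/meta/train.csv', 'w', newline='') as csv_f:
    process_ted(csv_f, 'train')
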
def process_ted(csv_file, category):

    parent_path = _data_path + 'TEDLIUM_release2/' + category + '/'
    labels, wave_files, offsets, durs = [], [], [], []

    # create csv writer
    writer = csv.writer(csv_file, delimiter=',')

    # read STM file list
    stm_list = glob.glob(parent_path + 'stm/*')
    for stm in stm_list:
        with open(stm, 'rt') as f:
            records = f.readlines()
            for record in records:
                field = record.split()

                # wave file name
                wave_file = parent_path + 'sph/%s.sph.wav' % field[0]
                wave_files.append(wave_file)

                # label index
                labels.append(data.str2index(' '.join(field[6:])))

                # start, end info
                start, end = float(field[3]), float(field[4])
                offsets.append(start)
                durs.append(end - start)

    # save results
    for i, (wave_file, label, offset, dur) in enumerate(zip(wave_files, labels, offsets, durs)):
        fn = "%s-%.2f" % (wave_file.split('/')[-1], offset)
        target_filename = 'asset/data/preprocess/mfcc/' + fn + '.npy'
        if os.path.exists( target_filename ):
            continue
        # print info
        print("TEDLIUM corpus preprocessing (%d / %d) - '%s-%.2f]" % (i, len(wave_files), wave_file, offset))
        # load wave file
        if not os.path.exists(wave_file):
            sph_file = wave_file.rsplit('.', 1)[0]
            if os.path.exists(sph_file):
                convert_sph(sph_file, wave_file)
            else:
                raise RuntimeError("Missing sph file from TedLium corpus at %s" % sph_file)
        wave, sr = librosa.load(wave_file, mono=True, sr=None, offset=offset, duration=dur)

        # get mfcc feature
        mfcc = librosa.feature.mfcc(wave, sr=16000)

        # save result ( exclude small mfcc data to prevent ctc loss )
        if len(label) < mfcc.shape[1]:
            # filename

            # save meta info
            writer.writerow([fn] + label)

            # save mfcc
            np.save(target_filename, mfcc, allow_pickle=False)
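
The convert_sph helper called above is not defined in this example. A minimal sketch of what it might look like, assuming the sox command-line tool is installed with NIST SPHERE support (the name and signature are taken from the call site, not from the original source):

import subprocess

def convert_sph(sph_file, wav_file):
    # decode the SPHERE file to a 16 kHz mono WAV; sox output options precede the output path
    subprocess.check_call(['sox', sph_file, '-r', '16000', '-c', '1', wav_file])
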
Example #3
def save_recording(wave_file_name, wave, sentence, csv_file_name):
    mfcc = librosa.feature.mfcc(wave, sr=16000)
    time_id = strftime("%d_%b_%Y_%H_%M_%S", gmtime())
    target_file_name = wave_file_name.split(".")[0] + time_id + ".npy"
    print(sentence)
    label = " ".join(map(str, data.str2index(unidecode(sentence))))
    with open(csv_file_name, "a") as file_csv:
        file_csv.write(",".join([target_file_name, label]) + "\n")
    np.save(FOLDER_MFCC + "/" + target_file_name, mfcc, allow_pickle=False)
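
A hypothetical call, assuming wave already holds mono 16 kHz samples and that FOLDER_MFCC and the CSV directory exist (file names here are illustrative only):

wave, sr = librosa.load('recording.wav', mono=True, sr=16000)
save_recording('recording.wav', wave, 'hello world', 'asset/data/preprocess/meta/recorded.csv')
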
Example #4
def process_commonvoice(csv_file, category):

    parent_path = _data_path + 'cv_corpus_v1/'
    labels, wave_files = [], []

    # create csv writer
    writer = csv.writer(csv_file, delimiter=',')

    with open(parent_path+category+'.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader: # filename,text,up_votes,down_votes,age,gender,accent,duration
            wave_file = parent_path + '/' + row['filename'] + '.wav'
            wave_files.append(wave_file)
            labels.append(data.str2index(row['text']))

    # save results
    count = 0  # number of clips saved
    f = 0      # number of failed loads
    s = 0      # number of clips skipped (label too long for MFCC frames)
    for i, (wave_file, label) in enumerate(zip(wave_files, labels)):
        try:
            fn = wave_file.replace('/', '-')
            target_filename = 'asset/data/preprocess/mfcc/' + fn + '.npy'
            if os.path.exists( target_filename ):
                continue
            # print info
            print("CommonVoice corpus preprocessing (%d / %d) - '%s']" % (i, len(wave_files), wave_file))

            wave, sr = librosa.load(wave_file, mono=True, sr=None)

            # re-sample ( 48K -> 16K )
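            # note: wave[::3] is plain decimation with no anti-aliasing filter,
            # and assumes the source really is 48 kHz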
            wave = wave[::3]

            # get mfcc feature
            mfcc = librosa.feature.mfcc(wave, sr=16000)
        except Exception as e:
            f += 1
            print('Failed for', wave_file, e)
            continue

        # save result ( exclude small mfcc data to prevent ctc loss )
        if len(label) < mfcc.shape[1]:
            # filename

            # save meta info
            writer.writerow([fn] + label)

            # save mfcc
            np.save(target_filename, mfcc, allow_pickle=False)
            count += 1
        else:
            s += 1
def process_vctk(csv_file):

    # create csv writer
    writer = csv.writer(csv_file, delimiter=',')

    # read label-info
    df = pd.read_table(_data_path + 'VCTK-Corpus/speaker-info.txt',
                       usecols=['ID'],
                       index_col=False,
                       delim_whitespace=True)

    # read file IDs
    file_ids = []
    for d in [
            _data_path + 'VCTK-Corpus/txt/p%d/' % uid for uid in df.ID.values
    ]:
        file_ids.extend([f[-12:-4] for f in sorted(glob.glob(d + '*.txt'))])

    for i, f in enumerate(file_ids):

        # wave file name
        wave_file = _data_path + 'VCTK-Corpus/wav48/%s/' % f[:4] + f + '.wav'

        # print info
        print("VCTK corpus preprocessing (%d / %d) - '%s']" %
              (i, len(file_ids), wave_file))

        # load wave file
        wave, sr = librosa.load(wave_file, mono=True, sr=None)

        # re-sample ( 48K -> 16K )
        wave = wave[::3]

        # get mfcc feature
        mfcc = librosa.feature.mfcc(wave, sr=16000)

        # get label index
        label = data.str2index(
            open(_data_path + 'VCTK-Corpus/txt/%s/' % f[:4] + f +
                 '.txt').read())

        # save result ( exclude small mfcc data to prevent ctc loss )
        if len(label) < mfcc.shape[1]:
            # filename
            fn = wave_file.split('/')[-1]

            # save meta info
            writer.writerow([fn] + label)

            # save mfcc
            np.save('asset/data/preprocess/mfcc/' + fn + '.npy',
                    mfcc,
                    allow_pickle=False)
def process_new_data(csv_file, category):

    parent_path = _data_path + 'FINAL_DATA/' + category + '/'
    labels, wave_files = [], []

    # create csv writer
    writer = csv.writer(csv_file, delimiter=',')

    # read label text file list
    f = open(parent_path + 'text.txt', 'rt')
    records = f.readlines()
    for record in records:
        # parsing record
        field = record.split('|')  # split by '|'

        if (field[0] + '.wav' in os.listdir(parent_path + 'audio/')):
            # wave file name
            wave_file = parent_path + 'audio/' + '%s.wav' % field[0]
            wave_files.append(wave_file)

            # label index
            labels.append(data.str2index(
                field[1]))  # last column is text label

    f.close()

    # save results
    for i, (wave_file, label) in enumerate(zip(wave_files, labels)):
        fn = wave_file.split('/')[-1]
        target_filename = 'asset/data/preprocess/mfcc/' + fn + '.npy'
        if os.path.exists(target_filename):
            continue
        # print info
        print("new_data corpus preprocessing (%d / %d) - '%s']" %
              (i, len(wave_files), wave_file))

        # load wave file
        wave, sr = librosa.load(wave_file, mono=True, sr=None)

        # re-sample ( 48K -> 16K )
        # wave = wave[::3]

        # get mfcc feature
        mfcc = librosa.feature.mfcc(wave, sr=16000)

        # save result ( exclude small mfcc data to prevent ctc loss )
        if len(label) < mfcc.shape[1]:
            # save meta info
            writer.writerow([fn] + label)
            # save mfcc
            np.save(target_filename, mfcc, allow_pickle=False)
Example #7
def process_oneword(csv_file):

    # create csv writer
    writer = csv.writer(csv_file, delimiter=',')

    # read label-info
    #df = pd.read_table(_data_path + 'VCTK-Corpus/speaker-info.txt', usecols=['ID'],
    #                   index_col=False, delim_whitespace=True)

    oneword_dir = "../audios"
    file_ids = glob.glob(os.path.join(oneword_dir, "*.wav"))

    # read file IDs
    #file_ids = []
    #for d in [_data_path + 'VCTK-Corpus/txt/p%d/' % uid for uid in df.ID.values]:
    #    file_ids.extend([f[-12:-4] for f in sorted(glob.glob(d + '*.txt'))])

    for i, wave_file in enumerate(file_ids):

        # wave file name
        #wave_file = _data_path + 'VCTK-Corpus/wav48/%s/' % f[:4] + f + '.wav'
        fn = wave_file.split('/')[-1]

        target_filename = 'asset/data/preprocess/mfcc-one/' + fn + '.npy'
        if os.path.exists(target_filename):
            continue
        # print info
        print("One word corpus preprocessing (%d / %d) - '%s']" %
              (i, len(file_ids), wave_file))

        # load wave file
        wave, sr = librosa.load(wave_file, mono=True, sr=None)

        # re-sample ( 48K -> 16K )
        wave = wave[::3]

        # get mfcc feature
        mfcc = librosa.feature.mfcc(wave, sr=16000)

        # get label index
        label_fn = fn.split(".")[0]
        label = data.str2index(
            open('../audios_labels/%s' % (label_fn + '.txt')).read())

        # save result ( exclude small mfcc data to prevent ctc loss )
        if len(label) < mfcc.shape[1]:
            # save meta info
            writer.writerow([fn] + label)
            # save mfcc
            np.save(target_filename, mfcc, allow_pickle=False)
Example #8
def process_tidigits(csv_file):

    # create csv writer
    writer = csv.writer(csv_file, delimiter=',')

    # read label-info
    #df = pd.read_table(_data_path + 'TIDIGITS/spkrinfo.txt', usecols=['ID'],
    #                   index_col=False, delim_whitespace=True)

    # read file IDs just for woman ac
    file_ids = []
    #for d in [_data_path + 'VCTK-Corpus/txt/p%d/' % uid for uid in df.ID.values]:
    #    file_ids.extend([f[-12:-4] for f in sorted(glob.glob(d + '*.txt'))])

    d = _data_path + 'TIDIGITS/train/woman/ac/'
    file_ids.extend(sorted(glob.glob(d + '*.wav')))

    for i, wave_file in enumerate(file_ids):

        # wave file name
        fn = wave_file.split('/')[-1]
        target_filename = 'asset/data/preprocess/mfcc/' + fn + '.npy'
        if os.path.exists(target_filename):
            continue
        # print info
        print("TIDIGITS corpus preprocessing (%d / %d) - '%s']" %
              (i, len(file_ids), wave_file))

        # load wave file
        wave, sr = librosa.load(wave_file, mono=True, sr=None)

        # re-sample ( 48K -> 16K )
        wave = wave[::3]

        # get mfcc feature
        mfcc = librosa.feature.mfcc(wave, sr=16000)

        # get label index
        gold_text = re.sub(r"\.wav$", "", fn)  # the digit sequence is encoded in the file name
        label = data.str2index(gold_text)

        # save result ( exclude small mfcc data to prevent ctc loss )
        #if len(label) < mfcc.shape[1]:
        # save meta info
        writer.writerow([fn] + label)
        # save mfcc
        np.save(target_filename, mfcc, allow_pickle=False)
def process_vctk(csv_file):

    # create csv writer
    writer = csv.writer(csv_file, delimiter=',')

    # read label-info
    df = pd.read_table(_data_path + 'VCTK-Corpus/speaker-info.txt', usecols=['ID'],
                       index_col=False, delim_whitespace=True)

    # read file IDs
    file_ids = []
    for d in [_data_path + 'VCTK-Corpus/txt/p%d/' % uid for uid in df.ID.values]:
        file_ids.extend([f[-12:-4] for f in sorted(glob.glob(d + '*.txt'))])

    for i, f in enumerate(file_ids):

        # wave file name
        wave_file = _data_path + 'VCTK-Corpus/wav48/%s/' % f[:4] + f + '.wav'
        fn = wave_file.split('/')[-1]
        target_filename = 'asset/data/preprocess/mfcc/' + fn + '.npy'
        if os.path.exists( target_filename ):
            continue
        # print info
        print("VCTK corpus preprocessing (%d / %d) - '%s']" % (i, len(file_ids), wave_file))

        # load wave file
        wave, sr = librosa.load(wave_file, mono=True, sr=None)

        # re-sample ( 48K -> 16K )
        wave = wave[::3]

        # get mfcc feature
        mfcc = librosa.feature.mfcc(wave, sr=16000)

        # get label index
        label = data.str2index(open(_data_path + 'VCTK-Corpus/txt/%s/' % f[:4] + f + '.txt').read())

        # save result ( exclude small mfcc data to prevent ctc loss )
        if len(label) < mfcc.shape[1]:
            # save meta info
            writer.writerow([fn] + label)
            # save mfcc
            np.save(target_filename, mfcc, allow_pickle=False)
Example #10
def process_libri(csv_file, category):

    parent_path = _data_path + 'LibriSpeech/' + category + '/'
    labels, wave_files = [], []

    # create csv writer
    writer = csv.writer(csv_file, delimiter=',')

    # read directory list by speaker
    speaker_list = glob.glob(parent_path + '*')
    for spk in speaker_list:

        # read directory list by chapter
        chapter_list = glob.glob(spk + '/*/')
        for chap in chapter_list:

            # read label text file list
            txt_list = glob.glob(chap + '/*.txt')
            for txt in txt_list:
                with open(txt, 'rt') as f:
                    records = f.readlines()
                    for record in records:
                        # parsing record
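                        # a record looks like "<speaker>-<chapter>-<utterance> TEXT ...",
                        # e.g. "1089-134686-0000 SOME TRANSCRIPT" (IDs here are illustrative)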
                        field = record.split('-')  # split by '-'
                        speaker = field[0]
                        chapter = field[1]
                        field = field[2].split()  # split field[2] by ' '
                        utterance = field[0]  # first column is utterance id

                        # wave file name
                        wave_file = parent_path + '%s/%s/%s-%s-%s.flac' % \
                                                  (speaker, chapter, speaker, chapter, utterance)
                        wave_files.append(wave_file)

                        # label index
                        labels.append(data.str2index(' '.join(
                            field[1:])))  # last column is text label

    # save results
    for i, (wave_file, label) in enumerate(zip(wave_files, labels)):
        fn = wave_file.split('/')[-1]
        target_filename = 'asset/data/preprocess/mfcc/' + fn + '.npy'
        if os.path.exists(target_filename):
            continue
        # print info
        print("LibriSpeech corpus preprocessing (%d / %d) - '%s']" %
              (i, len(wave_files), wave_file))

        # load flac file
        wave, sr, _ = scikits.audiolab.flacread(wave_file)

        # get mfcc feature
        mfcc = librosa.feature.mfcc(wave, sr=16000)

        # save result ( exclude small mfcc data to prevent ctc loss )
        if len(label) < mfcc.shape[1]:
            # filename

            # save meta info
            writer.writerow([fn] + label)

            # save mfcc
            np.save(target_filename, mfcc, allow_pickle=False)
Example #12
def process_voxforge(csv_file):

    parent_path = _data_path + 'voxforge_corpus/'
    labels, wave_files = [], []

    # create csv writer
    writer = csv.writer(csv_file, delimiter=',')

    speaker_list = glob.glob(parent_path + '*')
    for spk in speaker_list:
        prompts_file = spk + '/' + 'etc/prompts-original'
        
        if not os.path.exists( prompts_file ):
            prompts_file = spk + '/' + 'etc/prompts.txt'
            if not os.path.exists( prompts_file ):
                continue
            
        with open(prompts_file, 'rt') as f:
            records = f.readlines()
            for record in records:
                try:
                    field = record.split()  # split record by whitespace

                    wave_file = spk + '/' + 'wav/' + field[0] + '.wav'
                    wave_files.append(wave_file)  # adding to list of file paths

                    # label index
                    labels.append(data.str2index(' '.join(field[1:])))  # remaining columns are the text label
                except Exception:
                    continue

    # save results
    count = 0  # number of clips saved
    f = 0      # number of failed loads
    s = 0      # number of clips skipped (label too long for MFCC frames)
    for i, (wave_file, label) in enumerate(zip(wave_files, labels)):
        try:
            fn = wave_file.replace('/', '-')
            target_filename = 'asset/data/preprocess/mfcc/' + fn + '.npy'
            if os.path.exists( target_filename ):
                continue
            # print info
            print("VoxForge corpus preprocessing (%d / %d) - '%s']" % (i, len(wave_files), wave_file))

            wave, sr = librosa.load(wave_file, mono=True, sr=None)

            # re-sample ( 48K -> 16K )
            wave = wave[::3]

            # get mfcc feature
            mfcc = librosa.feature.mfcc(wave, sr=16000)
        except Exception as e:
            print('failed for', wave_file, e)
            f += 1
            continue
        
        # save result ( exclude small mfcc data to prevent ctc loss )
        if len(label) < mfcc.shape[1]:
            # save meta info
            writer.writerow([fn] + label)
            # save mfcc
            np.save(target_filename, mfcc, allow_pickle=False)
            count += 1
        else:
            s += 1
def process_libri(csv_file, category):

    parent_path = _data_path + category + '/'
    labels, wave_files = [], []

    # create csv writer
    writer = csv.writer(csv_file, delimiter=',')

    # read directory list by speaker
    speaker_list = glob.glob(parent_path + '*')
    for spk in speaker_list:

        # read directory list by chapter
        chapter_list = glob.glob(spk + '/*/')
        for chap in chapter_list:

            # read label text file list
            txt_list = glob.glob(chap + '/*.txt')
            for txt in txt_list:
                with open(txt, 'rt') as f:
                    records = f.readlines()
                    for record in records:
                        # parsing record
                        field = record.split('-')  # split by '-'
                        speaker = field[0]
                        chapter = field[1]
                        field = field[2].split()  # split field[2] by ' '
                        utterance = field[0]  # first column is utterance id

                        # wave file name
                        wave_file = parent_path + '%s/%s/%s-%s-%s.flac' % \
                                                  (speaker, chapter, speaker, chapter, utterance)
                        wave_files.append(wave_file)

                        # label index
                        labels.append(data.str2index(' '.join(field[1:])))  # last column is text label

    # save results
    for i, (wave_file, label) in enumerate(zip(wave_files, labels)):
        fn = wave_file.split('/')[-1]  # extract file name
        target_filename = _root_path + 'preprocess/mfcc/' + fn + '.npy'
        if os.path.exists(target_filename):
            continue

        # print info
        print("LibriSpeech corpus preprocessing (%d / %d) - '%s']" % (i, len(wave_files), wave_file))

        # load flac file
        wave, sr, _ = scikits.audiolab.flacread(wave_file)

        # get mfcc feature: librosa.feature.mfcc returns an np.ndarray of shape
        # (n_mfcc, t), where t is the number of frames
        #mfcc = librosa.feature.mfcc(wave, sr=16000)

        n_fft = 400  # 16000 * 0.025 -> 25 ms window
        hop_length = 160  # 16000 * 0.01 -> 10 ms stride
        # number of frames: t = seconds_of_audio * sample_rate / hop_length
        mfcc_total = [] 
        mfcc = librosa.feature.mfcc(wave, sr=16000, n_fft=n_fft, hop_length=hop_length, n_mfcc=13)
        mfcc_delta = librosa.feature.delta(mfcc)
        mfcc_delta2 = librosa.feature.delta(mfcc, order=2)
        #mfcc = librosa.feature.melspectrogram(wave, sr=16000, n_fft=n_fft, hop_length=hop_length, n_mels=128)
        
        
        mfcc_total.append(mfcc)
        mfcc_total.append(mfcc_delta)
        mfcc_total.append(mfcc_delta2)
        
        mfcc = np.asarray(mfcc_total)  # shape: (3, 13, n_frames)

        mfcc_ = np.transpose(mfcc, axes=[2, 1, 0])  # shape: (n_frames, 13, 3)
        #mfcc_ = mfcc_[:,1:]

        # normalize to zero mean and unit variance
        mean = np.mean(mfcc_)
        std = np.std(mfcc_)
        mfcc_ = (mfcc_ - mean) / std


        # save result ( exclude small mfcc data to prevent ctc loss )
        if len(label) < mfcc_.shape[0]:
            """if len(label) > mfcc.shape[1], meaning that there has at least two characters in 10ms(hop_length) and we can not separate it."""
            # filename
            # save meta info
            writer.writerow([fn] + label)

            # save mfcc
            np.save(target_filename, mfcc_, allow_pickle=False)
Example #14
decoded, _ = tf.nn.ctc_beam_search_decoder(logit.sg_transpose(perm=[1, 0, 2]),
                                           seq_len,
                                           merge_repeated=False)

# to dense tensor
pred = tf.sparse_to_dense(decoded[0].indices, decoded[0].dense_shape,
                          decoded[0].values) + 1

targ = tf.placeholder(dtype=tf.int32, shape=(1, None))
loss = logit.sg_ctc(target=targ, seq_len=seq_len)

opt = tf.train.GradientDescentOptimizer(learning_rate=lr)
optimizer = opt.minimize(loss, var_list=(noise, ))
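# note: `noise` is assumed to be a trainable perturbation variable defined earlier
# (not shown in this snippet); it is the only variable the optimizer updates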

new_target = np.array(str2index(fool))

# run network
with tf.Session() as sess:

    # init variables
    tf.sg_init(sess)

    all_vars = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES) + \
          tf.get_collection(tf.GraphKeys.SAVEABLE_OBJECTS)

    vars_to_train = [el for el in all_vars if 'noise' not in el.name]

    # restore parameters
    saver = tf.train.Saver(vars_to_train)
    saver.restore(sess, tf.train.latest_checkpoint('asset/train'))